File size: 3,420 Bytes
1086ffd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import openai
import numpy as np
# import matplotlib
# print(matplotlib.matplotlib_fname())
import sys
sys.path.append('.')
from local_config import openai_key


def cluster_text(text_list, n_clusters=20, openai_api_key=openai_key):
    """Cluster texts by their OpenAI embeddings and return, for each cluster
    center, the distinct text farthest from that center.

    Args:
        text_list: list of strings to embed and cluster.
        n_clusters: number of KMeans clusters (default 20).
        openai_api_key: OpenAI API key (defaults to ``local_config.openai_key``).

    Returns:
        List of up to ``n_clusters`` distinct strings; for each cluster
        center, the not-yet-selected text with the largest euclidean
        distance to that center.
    """
    # Set OpenAI API key
    openai.api_key = openai_api_key
    model = "text-embedding-ada-002"

    # Convert text_list to numerical data using the OpenAI embeddings API
    # (one request per text).
    data = []
    for text in text_list:
        emb_req = openai.Embedding.create(input=[text], model=model)
        embeddings = emb_req.data[0].embedding
        data.append(embeddings)
    data = np.array(data)

    # Cluster the embeddings.
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(data)

    # Get the cluster centers.
    centers = kmeans.cluster_centers_

    # distances[s, c] = euclidean distance from sample s to center c,
    # shape (n_samples, n_clusters).
    distances = euclidean_distances(data, centers)

    # For each center, the row index of the sample farthest from it.
    indices = np.argmax(distances, axis=0)

    # Collect one distinct sample per center; if a center's farthest sample
    # was already taken by an earlier center, fall back to the next-farthest
    # unseen sample for that same center.
    samples = []
    seen_samples = set()
    for cluster_idx, sample_idx in enumerate(indices):
        sample = text_list[sample_idx]
        if sample not in seen_samples:
            samples.append(sample)
            seen_samples.add(sample)
        else:
            # BUG FIX: the original used distances[:, i] where i was a
            # *sample* row index, not a cluster column index — an IndexError
            # whenever i >= n_clusters, and a silently wrong ranking
            # otherwise. Rank by distance to THIS cluster's center instead.
            sorted_indices = np.argsort(distances[:, cluster_idx])
            for j in sorted_indices[::-1]:
                sample = text_list[j]
                if sample not in seen_samples:
                    samples.append(sample)
                    seen_samples.add(sample)
                    break

    # Return samples as list of strings
    return samples


def plot_clusters(text_list, n_clusters=20, openai_api_key=openai_key):
    """Embed the given texts with OpenAI, KMeans-cluster them, and display a
    2-D PCA scatter plot colored by cluster label and annotated with each text.

    Args:
        text_list: list of strings to embed, cluster, and plot.
        n_clusters: number of KMeans clusters (default 20).
        openai_api_key: OpenAI API key (defaults to ``local_config.openai_key``).
    """
    # Configure the OpenAI client.
    openai.api_key = openai_api_key
    model = "text-embedding-ada-002"

    # One embedding request per text; stack the vectors into a matrix.
    vectors = [
        openai.Embedding.create(input=[item], model=model).data[0].embedding
        for item in text_list
    ]
    matrix = np.array(vectors)

    # Fit KMeans to obtain a cluster label for every text.
    clusterer = KMeans(n_clusters=n_clusters)
    clusterer.fit(matrix)

    # Project the high-dimensional embeddings onto two principal components.
    projected = PCA(n_components=2).fit_transform(matrix)

    # Scatter the 2-D projection, colored by cluster, labeling every point.
    plt.scatter(projected[:, 0], projected[:, 1], c=clusterer.labels_)
    for idx, caption in enumerate(text_list):
        plt.annotate(caption, (projected[idx, 0], projected[idx, 1]))
    plt.show()


if __name__ == "__main__":
    # Smoke test on a handful of Chinese product reviews.
    reviews = [
        '一百多和三十的也看不出什么区别,包装精美,质量应该不错。',
        '质量很好 料子很不错 做工细致 样式好看 穿着很漂亮',
        ' 会卷的    建议买大的小的会卷   胖就别买了       没用',
        '大差了  布料很差  我也不想多说',
        '一点也不好,我买的东西拿都拿到快递员自己签收了还不给我,恶心恶心恶心,不要脸不要脸'
    ]

    # Pick one representative review per cluster, then visualize the
    # clusters (plot_clusters blocks until the window is closed).
    representatives = cluster_text(reviews, n_clusters=3)
    plot_clusters(reviews, n_clusters=3)

    print(representatives)