wxy01giser committed
Commit ba441c8 · verified · 1 Parent(s): 53c9ed9

Update cluster_insight.py

Files changed (1):
  1. cluster_insight.py +225 -225
cluster_insight.py CHANGED
@@ -1,225 +1,225 @@
'''
Visualization module based on vector clustering
'''
# cluster_insight.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as mcolors
-from wordcloud import WordCloud
+# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → newly added function
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba


def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for a cluster.
    :param sentences: all sentences
    :param labels: cluster label of each sentence
    :param cluster_id: cluster to summarize
    :param top_n: number of keywords to return
    :return: keyword string
    """
    # 1. Collect all sentences that belong to this cluster
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize (protecting domain terms)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "spatial join", "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)
    # Basic stopwords (generic function words)
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }

    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. Extract keywords with TF-IDF
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Keep the terms with the highest mean TF-IDF score
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except Exception:
        return "关键词提取失败"


model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
# Load the sentence-embedding model from the local path
MODEL = SentenceTransformer(model_path)


def encode_sentences_with_cache(sentences, model):  # , cache_path='sentence_vectors.pkl'
    """
    Encode sentences with the sentence-BERT model; local caching is supported but currently disabled.
    """
    # if os.path.exists(cache_path):
    #     print(f"已找到缓存文件:{cache_path},正在加载向量...")
    #     with open(cache_path, 'rb') as f:
    #         sentence_vectors = pickle.load(f)
    # else:
    #     print("未找到缓存,开始编码...")
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    # print(f"编码完成,保存到:{cache_path}")
    # with open(cache_path, 'wb') as f:
    #     pickle.dump(sentence_vectors, f)

    return sentence_vectors


def auto_select_k(embeddings, max_k=10):
    """Automatically choose the number of clusters (highest silhouette score)."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    best_k = k_range[np.argmax(sil_scores)]
    print(f"自动选择最佳聚类数: k = {best_k} (轮廓系数: {max(sil_scores):.3f})")
    return best_k


def cluster_and_visualize(
    excel_path: str,
    questions=['s1', 's2', 's3', 's4'],
    max_k=15
):
    """
    Input: Excel file with one answer column per question (sentence vectors are encoded on the fly)
    Output: (cluster figure as a base64 PNG string, list of per-cluster statistics)
    """
    # 1. Load data
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            text = str(row[q]).strip()
            if text and text.lower() != 'nan':  # skip empty cells read in as NaN
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)
    # with open(pkl_path, 'rb') as f:
    #     emb = pickle.load(f)

    # 2. Automatically choose k
    n_clusters = auto_select_k(emb, max_k=max_k)
    # n_clusters = 8

    # 3. Clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    # For each cluster, index of the sentence closest to its centroid
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # After clustering, replace manually defined keywords with automatically extracted ones
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)

    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # Data points
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        showlegend=False
    ))
    # Cluster centers, plotted at each cluster's representative sentence
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family="SimHei", size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        title="EGISInsight:学生反馈聚类洞察",
        font=dict(family="Microsoft YaHei", size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5'
    )

    img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
    b64 = base64.b64encode(img_bytes).decode()

    return b64, stats
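For reference, a minimal driver sketch showing how the updated module might be invoked. It is not part of the commit: the file names student_feedback.xlsx and cluster_insight.png are illustrative assumptions, the Excel file is assumed to contain the 'no' column and the 's1'–'s4' answer columns that cluster_and_visualize reads, and rendering the PNG via fig.to_image additionally requires a static-image backend such as kaleido.

# example_usage.py — hypothetical driver script (not part of the commit).
# Assumes cluster_insight.py is importable and the Excel file has the
# 'no' and 's1'..'s4' columns that cluster_and_visualize expects.
import base64

from cluster_insight import cluster_and_visualize

if __name__ == "__main__":
    # Returns the figure as a base64 PNG string plus per-cluster statistics.
    b64_png, stats = cluster_and_visualize("student_feedback.xlsx", max_k=15)

    # Decode the base64 string and save the rendered figure.
    with open("cluster_insight.png", "wb") as f:
        f.write(base64.b64decode(b64_png))

    # Print a one-line summary per cluster.
    for c in stats:
        print(f"cluster {c['cluster_id']}: {c['size']} sentences "
              f"({c['ratio']:.1%}) | {c['keyword']}")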