Update cluster_insight.py
cluster_insight.py +225 -225
CHANGED
@@ -1,225 +1,225 @@
-from wordcloud import WordCloud
+# from wordcloud import WordCloud

'''
Visualization module based on vector clustering
'''
# cluster_insight.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → newly added function
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba


def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for one cluster.
    :param sentences: all sentences
    :param labels: cluster labels
    :param cluster_id: the cluster to process
    :param top_n: number of keywords to extract
    :return: keyword string
    """
    # 1. Collect all sentences belonging to this cluster
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize (protect domain terms)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated + merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "spatial join", "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)

    # Basic stopwords (generic function words)
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }

    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. Extract keywords with TF-IDF
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Take the words with the highest mean TF-IDF
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except Exception:
        return "关键词提取失败"


model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
# Load the model from the local path
MODEL = SentenceTransformer(model_path)


def encode_sentences_with_cache(sentences, model):  # , cache_path='sentence_vectors.pkl'
    """
    Encode sentences with BERT, with (currently disabled) local caching.
    """
    # if os.path.exists(cache_path):
    #     print(f"已找到缓存文件:{cache_path},正在加载向量...")
    #     with open(cache_path, 'rb') as f:
    #         sentence_vectors = pickle.load(f)
    # else:
    #     print("未找到缓存,开始编码...")
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    # print(f"编码完成,保存到:{cache_path}")
    # with open(cache_path, 'wb') as f:
    #     pickle.dump(sentence_vectors, f)

    return sentence_vectors


def auto_select_k(embeddings, max_k=10):
    """Automatically choose the best number of clusters (highest silhouette score)."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    best_k = k_range[np.argmax(sil_scores)]
    print(f"自动选择最佳聚类数: k = {best_k} (轮廓系数: {max(sil_scores):.3f})")
    return best_k


def cluster_and_visualize(
        excel_path: str,
        questions=['s1', 's2', 's3', 's4'],
        max_k=15
):
    """
    Input: Excel file (+ precomputed sentence-vector pkl, currently disabled)
    Output: (cluster plot as base64, list of per-cluster stats)
    """
    # 1. Load the data
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            text = str(row[q]).strip() if pd.notna(row[q]) else ''  # skip empty / NaN cells
            if text:
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)
    # with open(pkl_path, 'rb') as f:
    #     emb = pickle.load(f)

    # 2. Automatically choose k
    n_clusters = auto_select_k(emb, max_k=max_k)
    # n_clusters = 8

    # 3. Cluster
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    # Index of the sentence closest to each cluster center
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # After clustering, replace hand-picked keywords with automatically extracted ones
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)

    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # Data points
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        showlegend=False
    ))
    # Cluster centers
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family="SimHei", size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        title="EGISInsight:学生反馈聚类洞察",
        font=dict(family="Microsoft YaHei", size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5'
    )

    img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
    b64 = base64.b64encode(img_bytes).decode()

    return b64, stats
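A minimal usage sketch (not part of the committed file), assuming a hypothetical student_feedback.xlsx with a 'no' column and answer columns s1–s4, and assuming the kaleido package is installed so fig.to_image() can render the PNG:

# run_cluster_insight.py — hypothetical driver script, not part of this commit
import base64

from cluster_insight import cluster_and_visualize

if __name__ == "__main__":
    # "student_feedback.xlsx" and its column layout ('no', 's1'..'s4') are assumptions
    b64_png, stats = cluster_and_visualize(
        "student_feedback.xlsx",
        questions=['s1', 's2', 's3', 's4'],
        max_k=10
    )

    # Save the cluster plot, which is returned as a base64-encoded PNG
    with open("clusters.png", "wb") as f:
        f.write(base64.b64decode(b64_png))

    # Print a per-cluster summary from the stats list
    for c in stats:
        print(f"cluster {c['cluster_id']}: {c['size']} sentences ({c['ratio']:.1%}) "
              f"| keywords: {c['keyword']}")
        print(f"  representative: {c['rep_sentence']}")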