'''
Visualization module based on sentence-vector clustering.
'''
# cluster_insight.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → new helper functions
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
# Step 1: import plotly.io at the top of the file
import plotly.io as pio
# Step 2: set kaleido's default export format (this API also works on older Plotly versions)
pio.kaleido.scope.default_format = "png"
print("✅ kaleido configured for static image export")
# ========== Core configuration (same as before) ==========
FONT_FILE_PATH = "./SourceHanSansCN-Light.otf"  # font file lives in the project root
# CHINESE_FONT = "Source Han Sans CN Light"
CHINESE_FONT = "SimHei, Microsoft YaHei, Arial Unicode MS, sans-serif"
# ========== Key step: set an environment variable so Plotly/Kaleido can find the font ==========
os.environ["KALEIDO_FONT_SEARCH_PATH"] = os.getcwd()  # font search path = current directory
print(f"🔧 Font search path: {os.getcwd()}")
print(f"🔧 Font file exists: {os.path.exists(FONT_FILE_PATH)}")
# CHINESE_FONT = "Noto Sans SC" # 思源黑体(跨平台兼容,Plotly 自带)
def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for one cluster via TF-IDF.
    :param sentences: all sentences
    :param labels: cluster label for each sentence
    :param cluster_id: the cluster to summarize
    :param top_n: number of keywords to return
    :return: keyword string, joined by " | "
    """
    # 1. Collect the sentences that belong to this cluster
    cluster_texts = [s for s, l in zip(sentences, labels) if l == cluster_id]
    if not cluster_texts:
        return "无数据"  # "no data" for this cluster
    # 2. Tokenize (protect domain terms so jieba does not split them)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
"空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
"数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
"地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
"核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
"地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
"参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "data manager", "POI",
}
for word in DOMAIN_SET:
jieba.add_word(word, freq=10000)
    # Basic stopwords: generic function words plus classroom-context words
STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
"怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
"一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
"学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
"建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
}
    def tokenize(text):
        words = jieba.lcut(text)
        # keep tokens longer than one character, dropping stopwords and pure digits
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]
tokenized = [" ".join(tokenize(text)) for text in cluster_texts]
    # 3. Rank tokens by TF-IDF and keep the top words
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()
        # take the words with the highest mean TF-IDF across the cluster
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except ValueError:
        # e.g. empty vocabulary after stopword filtering
        return "关键词提取失败"  # "keyword extraction failed"
# Load the sentence-embedding model from a local snapshot directory
model_dir = os.path.join("sbert", "models--shibing624--text2vec-base-chinese", "snapshots", "183bb99aa7af74355fb58d16edf8c13ae7c5433e")
MODEL = SentenceTransformer(model_dir)
def encode_sentences_with_cache(sentences, model):
    """
    Encode sentences with the SBERT model.
    Note: despite the name, the on-disk pickle cache is currently disabled;
    see the optional sketch below for a version that re-enables it.
    """
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    return sentence_vectors
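# A sketch of the on-disk cache that earlier (commented-out) revisions of the
# function above implemented; `cache_path` is the hypothetical argument from
# that version, not something the current pipeline passes.
def encode_with_pickle_cache(sentences, model, cache_path='sentence_vectors.pkl'):
    if os.path.exists(cache_path):
        print(f"Found cache file {cache_path}, loading vectors...")
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    print("No cache found, encoding...")
    vectors = model.encode(sentences, batch_size=16, show_progress_bar=True, convert_to_tensor=False)
    print(f"Encoding done, saving to {cache_path}")
    with open(cache_path, 'wb') as f:
        pickle.dump(vectors, f)
    return vectors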
def auto_select_k(embeddings, max_k=10):
    """Automatically choose the cluster count with the highest silhouette score."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    if len(k_range) == 0:
        # too few samples to search over; fall back to the minimum of 2 clusters
        return 2
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))
    best_k = k_range[np.argmax(sil_scores)]
    print(f"Auto-selected cluster count: k = {best_k} (silhouette score: {max(sil_scores):.3f})")
    return best_k
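# Illustrative sketch: auto_select_k on synthetic data. make_blobs and every
# parameter here are arbitrary demo choices, not part of the pipeline; wrapped
# in a function so nothing runs at import time.
def _demo_auto_select_k():
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=60, centers=4, n_features=16, random_state=42)
    return auto_select_k(X, max_k=8)  # should recover a k close to 4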
def cluster_and_visualize(
    excel_path: str,
    questions=['s1', 's2', 's3', 's4'],
    max_k=15
):
    """
    Input: an Excel file with a 'no' column and one column per question.
    Output: (Plotly figure, clustering image as base64 PNG, per-cluster stats)
    """
    # 1. Load the data
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            if pd.isna(row[q]):
                continue  # skip empty cells, which str() would turn into the string "nan"
            text = str(row[q]).strip()
            if text:
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)
    # 2. Automatically choose k
    n_clusters = auto_select_k(emb, max_k=max_k)
    # n_clusters = 8  # manual override, if needed
    # 3. K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    # For each cluster, the index of the sentence closest to its center
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)
    # After clustering, replace hand-written labels with auto-extracted keywords
cluster_keywords_auto = []
for i in range(n_clusters):
kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
cluster_keywords_auto.append(kw)
    # 4. Per-cluster statistics
stats = []
total = len(sentences)
for i in range(n_clusters):
cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
size = len(cluster_sents)
rep_sent = sentences[closest[i]]
stats.append({
'cluster_id': i,
'size': size,
'ratio': size / total,
'rep_sentence': rep_sent,
'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
})
    # 5. Visualization: t-SNE projection to 2-D, then a Plotly scatter
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)
    # Note: cm.get_cmap(name, lut) is deprecated since matplotlib 3.7;
    # on newer versions use matplotlib.colormaps['rainbow'].resampled(n_clusters)
    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
point_colors = [cluster_colors[l] for l in labels]
fig = go.Figure()
    # Data points
fig.add_trace(go.Scatter(
x=emb_2d[:, 0], y=emb_2d[:, 1],
mode='markers',
marker=dict(size=10, color=point_colors, opacity=0.7),
text=[f"聚类 {l}" for l in labels],
hoverinfo='text',
hoverlabel=dict(
            font=dict(family=CHINESE_FONT, size=16),  # hover font that renders Chinese
bgcolor='white'
),
showlegend=False
))
    # Cluster centers (marked at each cluster's most central sentence)
center_x = emb_2d[closest, 0]
center_y = emb_2d[closest, 1]
for i, (x, y) in enumerate(zip(center_x, center_y)):
keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
fig.add_trace(go.Scatter(
x=[x], y=[y],
mode='markers+text',
marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
text=[keyword],
textposition="top center",
textfont=dict(family=CHINESE_FONT, size=20, color='black'),
showlegend=False
))
    fig.update_layout(
        # `titlefont` is deprecated in recent Plotly; pass the font inside `title`
        title=dict(
            text="EGISInsight:学生反馈聚类洞察",  # "EGISInsight: student-feedback cluster insight"
            font=dict(family=CHINESE_FONT, size=22)
        ),
        font=dict(family=CHINESE_FONT, size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5',
        autosize=False,  # disable auto-resizing
        margin=dict(l=50, r=50, t=80, b=50)  # keep the figure centered inside the export
    )
    # Previously: img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
    # New code: call plotly.io directly
img_bytes = pio.to_image(
fig,
format="png",
width=900,
height=600,
scale=2
)
b64 = base64.b64encode(img_bytes).decode('utf-8')
# print(f"{b64}解析成功!")
# return b64, stats
return fig, b64, stats
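# Hedged usage sketch: "feedback.xlsx" and "cluster_insight.png" are
# hypothetical file names; the real workbook must contain a 'no' column plus
# the question columns s1..s4 expected by cluster_and_visualize above.
if __name__ == "__main__":
    fig, b64_png, cluster_stats = cluster_and_visualize("feedback.xlsx", max_k=10)
    with open("cluster_insight.png", "wb") as f:
        f.write(base64.b64decode(b64_png))  # decode the base64 PNG back to bytes
    for item in cluster_stats:
        print(item['cluster_id'], item['size'], f"{item['ratio']:.1%}", item['keyword'])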