'''
Visualization module based on vector clustering (cluster insight for student feedback).
'''
# cluster_insight.py


import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → helper added for automatic keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba


# Step 1: import plotly.io at the top of the file
import plotly.io as pio

# Step 2: set the default static-export format to PNG via Kaleido
# (this attribute path also works on older Plotly/Kaleido versions)
pio.kaleido.scope.default_format = "png"
print("✅ kaleido set as the default image-export engine")

# ========== Core configuration ==========
FONT_FILE_PATH = "./SourceHanSansCN-Light.otf"  # font file lives in the project root
CHINESE_FONT = "SimHei, Microsoft YaHei, Arial Unicode MS, sans-serif"

# Key step: point Kaleido's font search path at the working directory so that
# Plotly/Kaleido can find the bundled Chinese font when exporting PNGs.
os.environ["KALEIDO_FONT_SEARCH_PATH"] = os.getcwd()
print(f"🔧 Font search path: {os.getcwd()}")
print(f"🔧 Font file present: {os.path.exists(FONT_FILE_PATH)}")

def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for one cluster.
    :param sentences: all sentences
    :param labels: cluster label of each sentence
    :param cluster_id: cluster to summarize
    :param top_n: number of keywords to return
    :return: keyword string
    """
    # 1. Collect all sentences belonging to this cluster
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize (register domain terms so jieba does not split them)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)

    # Basic stopwords: generic function words plus classroom vocabulary
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }
    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. Extract keywords via TF-IDF
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Take the terms with the highest mean TF-IDF across the cluster
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except ValueError:
        # fit_transform raises ValueError when the vocabulary ends up empty
        # (e.g. every token was filtered out as a stopword)
        return "关键词提取失败"


# Load the sentence-embedding model from a local snapshot of
# shibing624/text2vec-base-chinese (downloaded via sentence-transformers)
model_dir = os.path.join("sbert", "models--shibing624--text2vec-base-chinese", "snapshots", "183bb99aa7af74355fb58d16edf8c13ae7c5433e")
MODEL = SentenceTransformer(model_dir)

def encode_sentences_with_cache(sentences, model):
    """
    Encode sentences with the SBERT model.
    (The original pickle cache is disabled; see the sketch below for one way
    to re-enable it.)
    """
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    return sentence_vectors
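

# A minimal sketch of re-enabling the disabled cache, assuming a pickle file
# at a fixed path ('sentence_vectors.pkl' is the name hinted at in the
# original signature comment). Note it does not invalidate the cache when
# the sentence list changes.
def encode_sentences_cached(sentences, model, cache_path='sentence_vectors.pkl'):
    if os.path.exists(cache_path):
        print(f"Found cache {cache_path}, loading vectors...")
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    vectors = encode_sentences_with_cache(sentences, model)
    with open(cache_path, 'wb') as f:
        pickle.dump(vectors, f)
    return vectors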


def auto_select_k(embeddings, max_k=10):
    """Automatically pick the number of clusters (highest silhouette score)."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    if len(k_range) == 0:
        # Too few samples to scan candidate k values; fall back to k = 2
        return 2
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    best_k = k_range[np.argmax(sil_scores)]
    print(f"Auto-selected number of clusters: k = {best_k} (silhouette score: {max(sil_scores):.3f})")
    return best_k


def cluster_and_visualize(
        excel_path: str,
        questions=('s1', 's2', 's3', 's4'),
        max_k=15
    ):
    """
    Input: path to an Excel file with a 'no' column and the answer columns
    listed in `questions`
    Output: (Plotly figure, clustering plot as base64 PNG, per-cluster stats list)
    """
    # 1. Load data, skipping empty/NaN cells
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []  # (student no, question id) provenance for each sentence
    for _, row in df.iterrows():
        for q in questions:
            if pd.isna(row[q]):
                continue
            text = str(row[q]).strip()
            if text:
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)

    # 2. Automatically choose the number of clusters
    n_clusters = auto_select_k(emb, max_k=max_k)

    # 3. Cluster with KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
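    # For each cluster, find the sentence closest to its centroid:
    # broadcasting the (k, 1, D) centers against the (N, D) embeddings gives
    # a (k, N) distance matrix, and argmin over axis=1 picks one sentence
    # index per cluster (used below as the representative sentence).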
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # After clustering, replace hand-written labels with extracted keywords
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Per-cluster statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization: project embeddings to 2-D with t-SNE
    # (perplexity must be smaller than the sample count; clamp for small data)
    tsne = TSNE(n_components=2, random_state=42,
                perplexity=min(30, max(2, len(sentences) - 1)))
    emb_2d = tsne.fit_transform(emb)

    # cm.get_cmap(name, lut) was removed in Matplotlib 3.9; plt.get_cmap still works
    cmap = plt.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # Data points
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        hoverlabel=dict(
            font=dict(family=CHINESE_FONT, size=16),  # hover font must support Chinese
            bgcolor='white'
        ),
        showlegend=False
    ))
    # Cluster centers (representative sentences), labeled with their keywords
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family=CHINESE_FONT, size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        # `titlefont` is deprecated/removed in newer Plotly; use title.font
        title=dict(
            text="EGISInsight:学生反馈聚类洞察",
            font=dict(family=CHINESE_FONT, size=22)
        ),
        font=dict(family=CHINESE_FONT, size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5',
        autosize=False,   # disable automatic resizing
        margin=dict(l=50, r=50, t=80, b=50)  # keep the plot centered in the export
    )

    # Export via plotly.io (equivalent to fig.to_image, but goes through the
    # pio/Kaleido configuration set at the top of this module)
    img_bytes = pio.to_image(
        fig,
        format="png",
        width=900,
        height=600,
        scale=2
    )
    b64 = base64.b64encode(img_bytes).decode('utf-8')
    return fig, b64, stats
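

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumptions (not part of the original module): an
# Excel file named 'feedback.xlsx' with a 'no' column and answer columns
# s1..s4; adjust the path and column names to your data.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    fig, b64, stats = cluster_and_visualize("feedback.xlsx", max_k=10)
    for s in stats:
        print(f"cluster {s['cluster_id']}: {s['size']} sentences "
              f"({s['ratio']:.1%}) | {s['keyword']}")
        print(f"  representative: {s['rep_sentence']}")
    fig.write_html("cluster_insight.html")  # interactive version for the browser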