wxy01giser committed
Commit ba441c8 · verified · 1 Parent(s): 53c9ed9

Update cluster_insight.py

Files changed (1):
  1. cluster_insight.py +225 -225
cluster_insight.py CHANGED
@@ -1,225 +1,225 @@
'''
Visualization module based on vector clustering
'''
# cluster_insight.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.colors as mcolors
-from wordcloud import WordCloud
+# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO
from sentence_transformers import SentenceTransformer, util
import os
import pickle
# cluster_insight.py → newly added function
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba


def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for a cluster.
    :param sentences: all sentences
    :param labels: cluster label of each sentence
    :param cluster_id: cluster to summarize
    :param top_n: number of keywords to return
    :return: keyword string
    """
    # 1. Collect all sentences that belong to this cluster
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize (protecting domain terms)
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "spatial join", "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)
    # Basic stopwords (generic function words)
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }

    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. Extract keywords with TF-IDF
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Keep the terms with the highest mean TF-IDF score
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except Exception:
        return "关键词提取失败"


model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
# Load the sentence-embedding model from the local path
MODEL = SentenceTransformer(model_path)


def encode_sentences_with_cache(sentences, model):  # , cache_path='sentence_vectors.pkl'
    """
    Encode sentences with the sentence-BERT model; local caching is supported but currently disabled.
    """
    # if os.path.exists(cache_path):
    #     print(f"已找到缓存文件:{cache_path},正在加载向量...")
    #     with open(cache_path, 'rb') as f:
    #         sentence_vectors = pickle.load(f)
    # else:
    #     print("未找到缓存,开始编码...")
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    # print(f"编码完成,保存到:{cache_path}")
    # with open(cache_path, 'wb') as f:
    #     pickle.dump(sentence_vectors, f)

    return sentence_vectors


def auto_select_k(embeddings, max_k=10):
    """Automatically choose the number of clusters (highest silhouette score)."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    best_k = k_range[np.argmax(sil_scores)]
    print(f"自动选择最佳聚类数: k = {best_k} (轮廓系数: {max(sil_scores):.3f})")
    return best_k


def cluster_and_visualize(
    excel_path: str,
    questions=['s1', 's2', 's3', 's4'],
    max_k=15
):
    """
    Input: Excel file with one answer column per question (sentence vectors are encoded on the fly)
    Output: (cluster figure as a base64 PNG string, list of per-cluster statistics)
    """
    # 1. Load data
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            text = str(row[q]).strip()
            if text and text.lower() != 'nan':  # skip empty cells read in as NaN
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)
    # with open(pkl_path, 'rb') as f:
    #     emb = pickle.load(f)

    # 2. Automatically choose k
    n_clusters = auto_select_k(emb, max_k=max_k)
    # n_clusters = 8

    # 3. Clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    # For each cluster, index of the sentence closest to its centroid
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # After clustering, replace manually defined keywords with automatically extracted ones
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)

    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # Data points
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        showlegend=False
    ))
    # Cluster centers, plotted at each cluster's representative sentence
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family="SimHei", size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        title="EGISInsight:学生反馈聚类洞察",
        font=dict(family="Microsoft YaHei", size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5'
    )

    img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
    b64 = base64.b64encode(img_bytes).decode()

    return b64, stats
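For reference, a minimal driver sketch showing how the updated module might be invoked. It is not part of the commit: the file names student_feedback.xlsx and cluster_insight.png are illustrative assumptions, the Excel file is assumed to contain the 'no' column and the 's1'–'s4' answer columns that cluster_and_visualize reads, and rendering the PNG via fig.to_image additionally requires a static-image backend such as kaleido.

# example_usage.py — hypothetical driver script (not part of the commit).
# Assumes cluster_insight.py is importable and the Excel file has the
# 'no' and 's1'..'s4' columns that cluster_and_visualize expects.
import base64

from cluster_insight import cluster_and_visualize

if __name__ == "__main__":
    # Returns the figure as a base64 PNG string plus per-cluster statistics.
    b64_png, stats = cluster_and_visualize("student_feedback.xlsx", max_k=15)

    # Decode the base64 string and save the rendered figure.
    with open("cluster_insight.png", "wb") as f:
        f.write(base64.b64decode(b64_png))

    # Print a one-line summary per cluster.
    for c in stats:
        print(f"cluster {c['cluster_id']}: {c['size']} sentences "
              f"({c['ratio']:.1%}) | {c['keyword']}")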