wxy01giser committed
Commit a215fac · verified · 1 Parent(s): 6ed25c3

Upload 8 files

Files changed (8):
  1. analyzer.py +27 -0
  2. app.gradio.py +143 -0
  3. cluster_insight.py +225 -0
  4. feedback_analyzer.py +210 -0
  5. main.py +143 -0
  6. qwen_api.py +30 -0
  7. requirements.txt +11 -0
  8. sankey_plot.py +54 -0
analyzer.py ADDED
@@ -0,0 +1,27 @@
+ # analyzer.py
+ from feedback_analyzer import generate_sankey_df, DOMAIN_VOCAB
+ from sankey_plot import plot_sankey_from_df
+
+ def analyze_teacher_dashboard(excel_path: str) -> str:
+     """
+     Input: path to the merged Excel workbook.
+     Output: the Sankey diagram as a base64 string (embeddable directly in HTML).
+     Fully in-memory pipeline: no intermediate files, no redundant return values.
+     """
+     # 1. Build the stopword list and the Sankey data in memory
+     sankey_df = generate_sankey_df(
+         file_path=excel_path,
+         text_columns=["s1", "s2", "s3", "s4"],
+         domain_words=DOMAIN_VOCAB,
+         top_n=30
+     )
+
+     # 2. Render the plot in memory → base64
+     sankey_b64 = plot_sankey_from_df(sankey_df, title="GIS实践教学改革方向捕捉")
+
+     # 3. Return only the final result
+     return sankey_b64
+
+ if __name__ == '__main__':
+     sankey_b64 = analyze_teacher_dashboard(excel_path="E:\\data\\20250621Edu\\ex02.xlsx")
+     print(sankey_b64[:80])  # print a prefix to confirm an image was produced
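For a quick local check of the returned string, a minimal sketch (the workbook path is a placeholder) that dumps the PNG into a throwaway HTML page:

    from analyzer import analyze_teacher_dashboard

    b64 = analyze_teacher_dashboard(excel_path="ex02.xlsx")  # placeholder path
    with open("sankey_preview.html", "w", encoding="utf-8") as f:
        f.write(f'<img src="data:image/png;base64,{b64}">')  # open in a browser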
app.gradio.py ADDED
@@ -0,0 +1,143 @@
+ # app.gradio.py
+ import gradio as gr
+ from analyzer import analyze_teacher_dashboard
+ from cluster_insight import cluster_and_visualize
+ from qwen_api import call_qwen
+
+
+ # ================== LLM interface (Tongyi Qianwen) ==================
+ def generate_teaching_advice(sankey_b64, cluster_stats):
+     # 1. Build a structured prompt.
+     # NOTE: the Sankey flow summary below is currently hard-coded;
+     # only the cluster statistics are filled in dynamically.
+     prompt = f"""
+ 你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:
+
+ 【桑基图分析】
+ - 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
+ - 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用
+
+ 【聚类分析】
+ """
+     for s in cluster_stats[:3]:  # top 3 clusters
+         prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
+         prompt += f" 代表句:{s['rep_sentence'][:100]}\n"
+
+     prompt += """
+ 【要求】
+ 1. 诊断核心教学痛点(3条)
+ 2. 提出针对性优化措施(微课/演示/作业)
+ 3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
+ 4. 建议 1 个课后作业(验证学生掌握)
+
+ 【输出格式】
+ # 教学优化方案
+ ## 1. 核心痛点
+ ## 2. 优化措施
+ ## 3. 微课脚本
+ ## 4. 课后作业
+ """
+
+     # 2. Call Qwen and wrap the answer for HTML display
+     advice = call_qwen(prompt)
+     return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"
+
+
+ # ================== Analysis pipeline ==================
+ def analyze_report(file):
+     if not file:
+         return "请上传 Excel 文件"
+
+     try:
+         # 1. Sankey diagram
+         sankey_b64 = analyze_teacher_dashboard(excel_path=file.name)
+         # 2. Cluster plot and statistics
+         cluster_b64, cluster_stats = cluster_and_visualize(excel_path=file.name)
+         # 3. Teaching advice from the LLM
+         advice = generate_teaching_advice(sankey_b64, cluster_stats)
+
+         # 4. Cluster statistics table
+         stats_table = """
+         <h3>聚类主题统计</h3>
+         <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
+             <tr style="background:#f0f0f0;">
+                 <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
+             </tr>
+         """
+         for s in cluster_stats:
+             stats_table += f"""
+             <tr>
+                 <td>{s['cluster_id']}</td>
+                 <td><strong>{s['keyword']}</strong></td>
+                 <td>{s['size']}</td>
+                 <td>{s['ratio']:.1%}</td>
+                 <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
+             </tr>
+             """
+         stats_table += "</table>"
+
+         # 5. Final HTML output
+         html = f"""
+         <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
+             <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
+             <p style="text-align:center; color:#555; font-size:16px;">
+                 GIS 教学智能体 · 循证教学优化
+             </p>
+             <hr style="border: 1px solid #eee; margin: 30px 0;">
+
+             <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
+             <img src="data:image/png;base64,{sankey_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
+             <img src="data:image/png;base64,{cluster_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <div style="margin-top:30px;">
+                 {stats_table}
+             </div>
+
+             <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
+                 {advice}
+             </div>
+
+             <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
+                 EGISInsight © 2025 | 从数据到教学内容改革
+             </p>
+         </div>
+         """
+         return html
+
+     except Exception as e:
+         return f"分析失败:{str(e)}"
+
+
+ # ================== Gradio UI ==================
+ with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
+     gr.Markdown("# GIS实验报告智能分析系统")
+     gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")
+
+     with gr.Row():
+         file_input = gr.File(
+             label="上传 ex02.xlsx(含 s1-s4 列)",
+             file_types=[".xlsx"]
+         )
+
+     with gr.Row():
+         output = gr.HTML(label="分析结果")
+
+     # analyze_report returns a single HTML string, matching the single output
+     file_input.change(analyze_report, inputs=file_input, outputs=output)
+
+     gr.Markdown("---")
+     gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")
+
+ # ================== Launch ==================
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True  # set False to keep the app local
+     )
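Note that a file named app.gradio.py cannot be imported with a plain import statement because of the extra dot; the sketch below assumes the module has been renamed (for example to app_gradio.py). Gradio hands analyze_report an object exposing a .name attribute, which SimpleNamespace can mimic for a UI-free smoke test:

    from types import SimpleNamespace
    from app_gradio import analyze_report  # assumed rename of app.gradio.py

    fake_upload = SimpleNamespace(name="ex02.xlsx")  # placeholder path
    print(analyze_report(fake_upload)[:200])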
cluster_insight.py ADDED
@@ -0,0 +1,225 @@
+ '''
+ Clustering-based visualization module (sentence-vector clustering).
+ '''
+ # cluster_insight.py
+ import base64
+
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from sklearn.manifold import TSNE
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import plotly.graph_objects as go
+ import matplotlib.cm as cm
+ import matplotlib.colors as mcolors
+ from sentence_transformers import SentenceTransformer
+ import jieba
+
+
+ def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
+     """
+     Automatically extract keywords for one cluster.
+     :param sentences: all sentences
+     :param labels: cluster labels
+     :param cluster_id: the cluster to summarize
+     :param top_n: number of keywords to keep
+     :return: keyword string
+     """
+     # 1. Collect the sentences belonging to this cluster
+     cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
+     if not cluster_texts:
+         return "无数据"
+
+     # 2. Tokenize, protecting domain terms from being split
+     DOMAIN_SET = {
+         # Chinese domain terms (deduplicated and merged)
+         "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
+         "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
+         "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
+         "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
+         "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
+         "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
+
+         # English domain terms (deduplicated)
+         "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
+         "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
+         "data manager", "POI",
+     }
+     for word in DOMAIN_SET:
+         jieba.add_word(word, freq=10000)
+
+     # Generic stopwords: function words plus course-generic vocabulary
+     STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
+                  "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
+                  "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
+                  "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
+                  "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
+                  }
+
+     def tokenize(text):
+         words = jieba.lcut(text)
+         return [
+             w for w in words
+             if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
+         ]
+
+     tokenized = [" ".join(tokenize(text)) for text in cluster_texts]
+
+     # 3. TF-IDF keyword extraction
+     vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
+     try:
+         tfidf_matrix = vectorizer.fit_transform(tokenized)
+         feature_names = vectorizer.get_feature_names_out()
+
+         # Keep the terms with the highest mean TF-IDF across the cluster
+         mean_tfidf = tfidf_matrix.mean(axis=0).A1
+         top_indices = mean_tfidf.argsort()[-top_n:][::-1]
+         keywords = [feature_names[i] for i in top_indices]
+         return " | ".join(keywords)
+     except Exception:
+         return "关键词提取失败"
+
+
+ # Load the sentence-embedding model from a local snapshot
+ model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
+ MODEL = SentenceTransformer(model_path)
+
+
+ def encode_sentences_with_cache(sentences, model):
+     """
+     Encode sentences with SBERT. (The pickle-based on-disk cache has been
+     disabled; vectors are recomputed on every call.)
+     """
+     sentence_vectors = model.encode(
+         sentences,
+         batch_size=16,
+         show_progress_bar=True,
+         convert_to_tensor=False
+     )
+     return sentence_vectors
+
+
+ def auto_select_k(embeddings, max_k=10):
+     """Pick the cluster count with the highest silhouette score."""
+     sil_scores = []
+     k_range = range(2, min(max_k + 1, len(embeddings) // 2))
+     for k in k_range:
+         kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
+         labels = kmeans.fit_predict(embeddings)
+         sil_scores.append(silhouette_score(embeddings, labels))
+
+     best_k = k_range[np.argmax(sil_scores)]
+     print(f"Auto-selected cluster count: k = {best_k} (silhouette: {max(sil_scores):.3f})")
+     return best_k
+
+
+ def cluster_and_visualize(
+     excel_path: str,
+     questions=['s1', 's2', 's3', 's4'],
+     max_k=15
+ ):
+     """
+     Input: Excel path.
+     Output: (cluster plot as base64, list of per-cluster statistics dicts)
+     """
+     # 1. Load data
+     df = pd.read_excel(excel_path)
+     sentences = []
+     meta = []  # (student no, question id) per sentence, kept for traceability
+     for idx, row in df.iterrows():
+         for q in questions:
+             text = str(row[q]).strip() if pd.notna(row[q]) else ""
+             if text:
+                 sentences.append(text)
+                 meta.append((row['no'], q))
+     emb = encode_sentences_with_cache(sentences, MODEL)
+
+     # 2. Choose k automatically
+     n_clusters = auto_select_k(emb, max_k=max_k)
+
+     # 3. Cluster; `closest` is the index of the sentence nearest each centroid
+     kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
+     labels = kmeans.labels_
+     closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)
+
+     # Auto-extracted keywords replace hand-written cluster names
+     cluster_keywords_auto = []
+     for i in range(n_clusters):
+         kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
+         cluster_keywords_auto.append(kw)
+
+     # 4. Per-cluster statistics
+     stats = []
+     total = len(sentences)
+     for i in range(n_clusters):
+         cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
+         size = len(cluster_sents)
+         rep_sent = sentences[closest[i]]
+         stats.append({
+             'cluster_id': i,
+             'size': size,
+             'ratio': size / total,
+             'rep_sentence': rep_sent,
+             'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
+         })
+
+     # 5. Visualization: t-SNE projection to 2-D
+     tsne = TSNE(n_components=2, random_state=42)
+     emb_2d = tsne.fit_transform(emb)
+
+     cmap = cm.get_cmap('rainbow', n_clusters)
+     cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
+     point_colors = [cluster_colors[l] for l in labels]
+
+     fig = go.Figure()
+     # Data points
+     fig.add_trace(go.Scatter(
+         x=emb_2d[:, 0], y=emb_2d[:, 1],
+         mode='markers',
+         marker=dict(size=10, color=point_colors, opacity=0.7),
+         text=[f"聚类 {l}" for l in labels],
+         hoverinfo='text',
+         showlegend=False
+     ))
+     # Cluster centres, labelled with their auto-extracted keywords
+     center_x = emb_2d[closest, 0]
+     center_y = emb_2d[closest, 1]
+     for i, (x, y) in enumerate(zip(center_x, center_y)):
+         keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
+         fig.add_trace(go.Scatter(
+             x=[x], y=[y],
+             mode='markers+text',
+             marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
+             text=[keyword],
+             textposition="top center",
+             textfont=dict(family="SimHei", size=20, color='black'),
+             showlegend=False
+         ))
+
+     fig.update_layout(
+         title="EGISInsight:学生反馈聚类洞察",
+         font=dict(family="Microsoft YaHei", size=18),
+         width=900, height=600,
+         plot_bgcolor='#F5F5F5'
+     )
+
+     # Export to PNG (requires kaleido) and encode as base64
+     img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
+     b64 = base64.b64encode(img_bytes).decode()
+
+     return b64, stats
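A self-contained illustration of the silhouette-based k selection used by auto_select_k (inlined here because importing cluster_insight loads the SBERT model at import time): with three well-separated synthetic blobs, the best silhouette should land on k = 3.

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_score

    X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.5, random_state=0)
    scores = {k: silhouette_score(X, KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X))
              for k in range(2, 9)}
    print(max(scores, key=scores.get))  # expected: 3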
feedback_analyzer.py ADDED
@@ -0,0 +1,210 @@
+ """
+ Generate the stopword list and the Sankey-diagram data.
+ """
+
+ import re
+ import warnings
+ from collections import Counter
+
+ import pandas as pd
+ import jieba
+
+ warnings.filterwarnings('ignore')
+
+ # Domain vocabulary, protected from segmentation and from the stopword filter
+ DOMAIN_VOCAB = [
+     # Chinese domain terms (deduplicated and merged)
+     "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
+     "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
+     "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
+     "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
+     "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
+     "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
+
+     # English domain terms (deduplicated)
+     "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
+     "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
+     "data manager", "POI",
+ ]
+
+
+ def generate_domain_stopwords(df, text_columns, domain_keywords):
+     """Build a stopword list that spares the domain keywords."""
+     # 1. Base stopwords: generic function words plus course-generic vocabulary
+     common_stopwords = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么", "怎么",
+                         "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点", "一点",
+                         "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "进行", "对于", "实际", "情况", "结合",
+                         "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
+                         "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
+                         }
+     # 2. Domain keywords must never become stopwords
+     domain_words = set(domain_keywords)
+
+     # 3. Merge and normalize all answer text
+     all_text = ""
+     for col in text_columns:
+         all_text += " ".join(df[col].fillna("").astype(str))
+
+     all_text = re.sub(r"[^\w\s]", "", all_text)
+     all_text = re.sub(r"\d+", "", all_text)
+     all_text = all_text.lower()  # normalize English to lowercase
+     words = jieba.lcut(all_text)
+
+     # 4. Word frequencies (single characters dropped)
+     word_freq = Counter(words)
+     word_freq = {word: freq for word, freq in word_freq.items() if len(word) > 1}
+
+     # 5. Candidate stopwords: the base list, plus words that occur in more
+     #    than half of the rows yet are not domain keywords
+     stopwords = common_stopwords.copy()
+     for word, freq in word_freq.items():
+         if freq > len(df) * 0.5 and word not in domain_words:
+             stopwords.add(word)
+
+     return stopwords
+
+
+ def load_and_preprocess_data(df, stopwords, domain_words):
+     """Load and preprocess the data, with protected recognition of domain terms."""
+     stopwords = set(stopwords)
+
+     question_types = {
+         "s1": "难点",
+         "s2": "讲解需求",
+         "s3": "操作疑惑",
+         "s4": "应用场景"
+     }
+
+     # Causal order of the questions (lower index = earlier in the chain)
+     question_hierarchy = ["s1", "s2", "s3", "s4"]
+
+     # Register every domain term with jieba so it is segmented as one token
+     professional_dict = {}
+     for word in domain_words:
+         jieba.add_word(word, freq=10000)  # high frequency → recognized first
+         professional_dict[word] = 1
+
+     # Regex over the domain terms, longest first so long terms match before
+     # their substrings
+     sorted_domain = sorted(domain_words, key=len, reverse=True)
+     pattern_str = "|".join(re.escape(word) for word in sorted_domain)
+     professional_pattern = re.compile(f"({pattern_str})")
+
+     def clean_text(text):
+         if not isinstance(text, str):
+             return []
+         # Step 1: basic cleaning
+         text_cleaned = re.sub(r"[^\w\s]", "", text)
+         text_cleaned = re.sub(r"\d+", "", text_cleaned)
+         text_cleaned = text_cleaned.lower()
+         # Step 2: locate every domain-term occurrence
+         matches = []
+         for match in professional_pattern.finditer(text_cleaned):
+             start, end = match.span()
+             matches.append((start, end, text_cleaned[start:end]))
+         # Step 3: split the text while protecting the domain terms
+         segments = []
+         last_end = 0
+         for start, end, word in matches:
+             if start > last_end:
+                 segments.append(text_cleaned[last_end:start])
+             segments.append(word)  # keep the domain term as one segment
+             last_end = end
+         if last_end < len(text_cleaned):
+             segments.append(text_cleaned[last_end:])
+         # Step 4: segment only the non-domain parts
+         final_words = []
+         for segment in segments:
+             if segment in professional_dict:
+                 final_words.append(segment)  # domain term, kept whole
+             else:
+                 words = jieba.lcut(segment)
+                 words = [w for w in words if w not in stopwords and len(w) > 1]
+                 final_words.extend(words)
+         return final_words
+
+     for col in ["s1", "s2", "s3", "s4"]:
+         df[col + "_words"] = df[col].apply(clean_text)
+
+     return df, question_types, question_hierarchy
+
+
+ def build_sankey_data(df, question_columns, top_n=30):
+     """
+     Build the Sankey-diagram data (DataFrame: source, target, value),
+     keeping only the top_n globally most frequent keywords.
+     """
+     question_labels = {
+         "s1": "S1_难点",
+         "s2": "S2_讲解需求",
+         "s3": "S3_操作疑惑",
+         "s4": "S4_应用场景"
+     }
+
+     # 1. Global keyword frequencies
+     all_keywords = []
+     for col in question_columns:
+         all_keywords.extend([kw for kws in df[col + "_words"] for kw in kws])
+     keyword_freq = Counter(all_keywords)
+     core_keywords = set([kw for kw, _ in keyword_freq.most_common(top_n)])
+
+     # 2. Count (question label, keyword) links
+     link_counter = Counter()
+     for _, row in df.iterrows():
+         for q in question_columns:
+             q_label = question_labels[q]
+             keywords = row[q + "_words"]
+             for kw in keywords:
+                 if kw in core_keywords:
+                     link_counter[(q_label, kw)] += 1
+
+     # 3. To DataFrame
+     sankey_data = pd.DataFrame([
+         {"source": src, "target": tgt, "value": val}
+         for (src, tgt), val in link_counter.items()
+     ])
+     print(f"Sankey link table built: {len(sankey_data)} rows")
+     return sankey_data
+
+
+ def generate_sankey_df(file_path, text_columns, domain_words, top_n=30):
+     df = pd.read_excel(file_path)
+     stopwords = generate_domain_stopwords(df, text_columns, domain_words)
+     df, question_types, question_hierarchy = load_and_preprocess_data(df, stopwords, domain_words)
+     sankey_data = build_sankey_data(df, text_columns, top_n)
+     return sankey_data
+
+
+ if __name__ == '__main__':
+     # Run the full analysis pipeline on a local workbook
+     file_path = "E:\\data\\20250621Edu\\ex02.xlsx"
+     text_columns = ["s1", "s2", "s3", "s4"]
+     top_n = 30  # keep the 30 most frequent keywords
+     sankey_data = generate_sankey_df(file_path, text_columns, DOMAIN_VOCAB, top_n)
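A shape check for build_sankey_data on a hand-tokenized toy frame: it counts (question label, keyword) co-occurrences into source/target/value rows, so no Excel file or jieba run is needed.

    import pandas as pd
    from feedback_analyzer import build_sankey_data

    toy = pd.DataFrame({
        "s1_words": [["核密度分析", "搜索半径"], ["栅格计算器"]],
        "s2_words": [["搜索半径"], []],
        "s3_words": [[], ["重分类"]],
        "s4_words": [["城市规划"], ["城市规划"]],
    })
    print(build_sankey_data(toy, ["s1", "s2", "s3", "s4"], top_n=10))
    # e.g. S1_难点 → 核密度分析 (value 1), S4_应用场景 → 城市规划 (value 2)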
main.py ADDED
@@ -0,0 +1,143 @@
+ # main.py — currently identical to app.gradio.py
+ import gradio as gr
+ from analyzer import analyze_teacher_dashboard
+ from cluster_insight import cluster_and_visualize
+ from qwen_api import call_qwen
+
+
+ # ================== LLM interface (Tongyi Qianwen) ==================
+ def generate_teaching_advice(sankey_b64, cluster_stats):
+     # 1. Build a structured prompt.
+     # NOTE: the Sankey flow summary below is currently hard-coded;
+     # only the cluster statistics are filled in dynamically.
+     prompt = f"""
+ 你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:
+
+ 【桑基图分析】
+ - 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
+ - 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用
+
+ 【聚类分析】
+ """
+     for s in cluster_stats[:3]:  # top 3 clusters
+         prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
+         prompt += f" 代表句:{s['rep_sentence'][:100]}\n"
+
+     prompt += """
+ 【要求】
+ 1. 诊断核心教学痛点(3条)
+ 2. 提出针对性优化措施(微课/演示/作业)
+ 3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
+ 4. 建议 1 个课后作业(验证学生掌握)
+
+ 【输出格式】
+ # 教学优化方案
+ ## 1. 核心痛点
+ ## 2. 优化措施
+ ## 3. 微课脚本
+ ## 4. 课后作业
+ """
+
+     # 2. Call Qwen and wrap the answer for HTML display
+     advice = call_qwen(prompt)
+     return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"
+
+
+ # ================== Analysis pipeline ==================
+ def analyze_report(file):
+     if not file:
+         return "请上传 Excel 文件"
+
+     try:
+         # 1. Sankey diagram
+         sankey_b64 = analyze_teacher_dashboard(excel_path=file.name)
+         # 2. Cluster plot and statistics
+         cluster_b64, cluster_stats = cluster_and_visualize(excel_path=file.name)
+         # 3. Teaching advice from the LLM
+         advice = generate_teaching_advice(sankey_b64, cluster_stats)
+
+         # 4. Cluster statistics table
+         stats_table = """
+         <h3>聚类主题统计</h3>
+         <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
+             <tr style="background:#f0f0f0;">
+                 <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
+             </tr>
+         """
+         for s in cluster_stats:
+             stats_table += f"""
+             <tr>
+                 <td>{s['cluster_id']}</td>
+                 <td><strong>{s['keyword']}</strong></td>
+                 <td>{s['size']}</td>
+                 <td>{s['ratio']:.1%}</td>
+                 <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
+             </tr>
+             """
+         stats_table += "</table>"
+
+         # 5. Final HTML output
+         html = f"""
+         <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
+             <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
+             <p style="text-align:center; color:#555; font-size:16px;">
+                 GIS 教学智能体 · 循证教学优化
+             </p>
+             <hr style="border: 1px solid #eee; margin: 30px 0;">
+
+             <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
+             <img src="data:image/png;base64,{sankey_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
+             <img src="data:image/png;base64,{cluster_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <div style="margin-top:30px;">
+                 {stats_table}
+             </div>
+
+             <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
+                 {advice}
+             </div>
+
+             <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
+                 EGISInsight © 2025 | 从数据到教学内容改革
+             </p>
+         </div>
+         """
+         return html
+
+     except Exception as e:
+         return f"分析失败:{str(e)}"
+
+
+ # ================== Gradio UI ==================
+ with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
+     gr.Markdown("# GIS实验报告智能分析系统")
+     gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")
+
+     with gr.Row():
+         file_input = gr.File(
+             label="上传 ex02.xlsx(含 s1-s4 列)",
+             file_types=[".xlsx"]
+         )
+
+     with gr.Row():
+         output = gr.HTML(label="分析结果")
+
+     # analyze_report returns a single HTML string, matching the single output
+     file_input.change(analyze_report, inputs=file_input, outputs=output)
+
+     gr.Markdown("---")
+     gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")
+
+ # ================== Launch ==================
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True  # set False to keep the app local
+     )
qwen_api.py ADDED
@@ -0,0 +1,30 @@
+ # qwen_api.py
+ import os
+ import requests
+
+ # Read the key from the environment rather than hard-coding a secret in the repo
+ QWEN_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
+ QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
+
+ def call_qwen(prompt: str) -> str:
+     headers = {
+         "Authorization": f"Bearer {QWEN_API_KEY}",
+         "Content-Type": "application/json"
+     }
+     payload = {
+         "model": "qwen-plus",
+         "input": {"prompt": prompt},
+         "parameters": {
+             "result_format": "text",
+             "temperature": 0.7,
+             "top_p": 0.8
+         }
+     }
+     try:
+         resp = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=30)
+         resp.raise_for_status()
+         result = resp.json()
+         return result['output']['text']
+     except Exception as e:
+         return f"【大模型调用失败】{str(e)}"
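Usage sketch: since the module reads the key at import time, set the environment variable first. DASHSCOPE_API_KEY is the name the rewritten module reads, matching the official DashScope convention.

    import os
    os.environ["DASHSCOPE_API_KEY"] = "sk-..."  # placeholder; never commit real keys

    from qwen_api import call_qwen
    print(call_qwen("用一句话介绍核密度分析。"))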
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==4.26.0
+ plotly
+ kaleido
+ pandas
+ openpyxl
+ scikit-learn
+ jieba
+ sentence-transformers
+ requests
+ torch
+ transformers
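One gap worth noting: matplotlib is imported directly by cluster_insight.py and sankey_plot.py yet is not listed explicitly here; it may arrive only transitively (e.g. via gradio). A quick runtime check that the static-image stack is complete:

    import plotly.graph_objects as go

    png = go.Figure().to_image(format="png")  # raises if kaleido is missing
    print(f"kaleido OK ({len(png)} bytes)")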
sankey_plot.py ADDED
@@ -0,0 +1,54 @@
+ # sankey_plot.py
+ import base64
+
+ import plotly.graph_objects as go
+ import matplotlib.cm as cm
+ import matplotlib.colors as mcolors
+ import pandas as pd
+
+
+ def plot_sankey_from_df(sankey_df: pd.DataFrame, title="问题 → 关键词共现") -> str:
+     if sankey_df.empty:
+         return "无数据"
+
+     # Keep only the top 15 target keywords by total flow
+     top_targets = sankey_df.groupby('target')['value'].sum().sort_values(ascending=False).head(15).index
+     df = sankey_df[sankey_df['target'].isin(top_targets)].copy()
+
+     # Node order: question nodes first, then keyword nodes
+     sources = ['S4_应用场景', 'S3_操作疑惑', 'S2_讲解需求', 'S1_难点']
+     sources = [s for s in sources if s in df['source'].unique()]
+     targets = top_targets.tolist()
+     all_nodes = sources + targets
+     node_index = {n: i for i, n in enumerate(all_nodes)}
+
+     # Colors: fixed palette for question nodes, Set3 colormap for keywords
+     source_color_map = {
+         'S1_难点': '#345DA7', 'S2_讲解需求': '#3B8AC4',
+         'S3_操作疑惑': '#4BB4DE', 'S4_应用场景': '#EFDBCB'
+     }
+     cmap = cm.get_cmap('Set3', len(targets))
+     target_colors = [mcolors.to_hex(cmap(i)) for i in range(len(targets))]
+     target_color_map = dict(zip(targets, target_colors))
+
+     node_colors = [source_color_map.get(n, target_color_map.get(n, 'gray')) for n in all_nodes]
+     link_colors = [target_color_map.get(t, 'gray') for t in df['target']]
+
+     fig = go.Figure(data=[go.Sankey(
+         node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5),
+                   label=all_nodes, color=node_colors),
+         link=dict(
+             source=df['source'].map(node_index),
+             target=df['target'].map(node_index),
+             value=df['value'],
+             color=link_colors
+         )
+     )])
+
+     fig.update_layout(title_text=title, font=dict(family="Microsoft YaHei", size=18), width=900, height=600,
+                       margin=dict(l=20, r=20, t=60, b=20))
+
+     # Export a hi-res PNG (requires kaleido); scale=2 doubles the DPI
+     img_bytes = fig.to_image(
+         format="png",
+         width=900,
+         height=600,
+         scale=2
+     )
+     return base64.b64encode(img_bytes).decode()
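An end-to-end render check with a toy link table (kaleido must be installed for fig.to_image):

    import pandas as pd
    from sankey_plot import plot_sankey_from_df

    links = pd.DataFrame([
        {"source": "S1_难点", "target": "核密度分析", "value": 3},
        {"source": "S2_讲解需求", "target": "搜索半径", "value": 2},
    ])
    print(plot_sankey_from_df(links, title="toy check")[:60])  # base64 prefix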