Upload 8 files

- analyzer.py +27 -0
- app.gradio.py +143 -0
- cluster_insight.py +225 -0
- feedback_analyzer.py +210 -0
- main.py +143 -0
- qwen_api.py +30 -0
- requirements.txt +11 -0
- sankey_plot.py +54 -0
analyzer.py
ADDED
@@ -0,0 +1,27 @@
# analyzer.py
from feedback_analyzer import generate_sankey_df, DOMAIN_VOCAB
from sankey_plot import plot_sankey_from_df

def analyze_teacher_dashboard(excel_path: str) -> str:
    """
    Input: path to the merged Excel file.
    Output: Sankey diagram as a base64 string (directly embeddable in HTML).
    Fully in-memory pipeline: no intermediate files, no redundant returns.
    """
    # 1. Build the stop-word list and the Sankey data in memory
    sankey_df = generate_sankey_df(
        file_path=excel_path,
        text_columns=["s1", "s2", "s3", "s4"],
        domain_words=DOMAIN_VOCAB,
        top_n=30
    )

    # 2. Plot in memory -> base64
    sankey_b64 = plot_sankey_from_df(sankey_df, title="GIS实践教学改革方向捕捉")

    # 3. Return only the final result
    return sankey_b64

if __name__ == '__main__':
    sankey_b64 = analyze_teacher_dashboard(excel_path="E:\\data\\20250621Edu\\ex02.xlsx")
    print(sankey_b64[:80])  # print a prefix of the base64 string as a smoke test
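
Since the function returns a bare base64 string, it can be decoded back to a PNG for offline inspection; a minimal sketch, with illustrative input/output paths:

# Decode the base64 PNG returned by analyze_teacher_dashboard for offline viewing.
import base64

from analyzer import analyze_teacher_dashboard

b64 = analyze_teacher_dashboard(excel_path="ex02.xlsx")  # hypothetical local path
with open("sankey.png", "wb") as f:
    f.write(base64.b64decode(b64))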
app.gradio.py
ADDED
@@ -0,0 +1,143 @@
# app.gradio.py
import gradio as gr
from analyzer import analyze_teacher_dashboard
from cluster_insight import cluster_and_visualize
from qwen_api import call_qwen


# ================== LLM step (Tongyi Qianwen) ==================
def generate_teaching_advice(sankey_b64, cluster_stats):
    # 1. Build a structured, domain-specific prompt
    prompt = """
你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:

【桑基图分析】
- 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
- 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用

【聚类分析】
"""
    for s in cluster_stats[:3]:  # top 3 clusters only
        prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
        prompt += f"  代表句:{s['rep_sentence'][:100]}\n"

    prompt += """
【要求】
1. 诊断核心教学痛点(3条)
2. 提出针对性优化措施(微课/演示/作业)
3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
4. 建议 1 个课后作业(验证学生掌握)

【输出格式】
# 教学优化方案
## 1. 核心痛点
## 2. 优化措施
## 3. 微课脚本
## 4. 课后作业
"""

    # 2. Call Tongyi Qianwen
    advice = call_qwen(prompt)
    return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"


# ================== Gradio callback ==================
def analyze_report(file):
    if not file:
        return "请上传 Excel 文件"

    try:
        # gr.File may deliver a plain path string or a wrapper exposing .name
        excel_path = file if isinstance(file, str) else file.name
        # 1. Analysis -> Sankey diagram
        sankey_b64 = analyze_teacher_dashboard(excel_path=excel_path)
        # 2. Cluster plot + per-cluster statistics
        cluster_b64, cluster_stats = cluster_and_visualize(excel_path=excel_path)
        # 3. Teaching advice from the LLM
        advice = generate_teaching_advice(sankey_b64, cluster_stats)

        # 4. Cluster statistics table
        stats_table = """
        <h3>聚类主题统计</h3>
        <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
            <tr style="background:#f0f0f0;">
                <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
            </tr>
        """
        for s in cluster_stats:
            stats_table += f"""
            <tr>
                <td>{s['cluster_id']}</td>
                <td><strong>{s['keyword']}</strong></td>
                <td>{s['size']}</td>
                <td>{s['ratio']:.1%}</td>
                <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
            </tr>
            """
        stats_table += "</table>"

        # 5. Final HTML output
        html = f"""
        <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
            <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
            <p style="text-align:center; color:#555; font-size:16px;">
                GIS 教学智能体 · 循证教学优化
            </p>
            <hr style="border: 1px solid #eee; margin: 30px 0;">

            <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
            <img src="data:image/png;base64,{sankey_b64}"
                 style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">

            <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
            <img src="data:image/png;base64,{cluster_b64}"
                 style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">

            <div style="margin-top:30px;">
                {stats_table}
            </div>

            <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
                {advice}
            </div>

            <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
                EGISInsight © 2025 | 从数据到教学内容改革
            </p>
        </div>
        """
        return html

    except Exception as e:
        return f"分析失败:{e}"


# ================== Gradio UI ==================
with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
    gr.Markdown("# GIS实验报告智能分析系统")
    gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")

    with gr.Row():
        file_input = gr.File(
            label="上传 ex02.xlsx(含 s1-s4 列)",
            file_types=[".xlsx"]
        )

    with gr.Row():
        output = gr.HTML(label="分析结果")

    # The handler returns a single HTML string for the single output component
    file_input.change(analyze_report, inputs=file_input, outputs=output)

    gr.Markdown("---")
    gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")

# ================== Launch ==================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True  # set to False to disable the public share link
    )
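
To exercise the UI without a DashScope key, qwen_api.call_qwen can be replaced with a stub before the app module is imported; the patch must precede the import because the app binds the name via `from qwen_api import call_qwen`. A minimal sketch with a hypothetical helper script and a canned reply:

# run_offline.py -- hypothetical helper: launch the UI with the LLM stubbed out.
import qwen_api

qwen_api.call_qwen = lambda prompt: "【离线桩】offline stub reply"  # canned reply

import main  # noqa: E402 -- must come after the patch; main builds `demo` on import

main.demo.launch()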
cluster_insight.py
ADDED
@@ -0,0 +1,225 @@
'''
Visualization module based on sentence-vector clustering.
'''
# cluster_insight.py
import base64

import jieba
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score


def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
    """
    Automatically extract keywords for one cluster.
    :param sentences: all sentences
    :param labels: cluster labels
    :param cluster_id: the cluster to summarize
    :param top_n: number of keywords to return
    :return: keyword string
    """
    # 1. Collect the sentences of this cluster
    cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
    if not cluster_texts:
        return "无数据"

    # 2. Tokenize, protecting domain terms
    DOMAIN_SET = {
        # Chinese domain terms (deduplicated and merged)
        "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
        "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
        "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
        "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
        "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
        "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

        # English domain terms (deduplicated)
        "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
        "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
        "data manager", "POI",
    }
    for word in DOMAIN_SET:
        jieba.add_word(word, freq=10000)  # high frequency keeps the term unsplit

    # Base stop words (generic function words)
    STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
                 "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
                 "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
                 "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                 "建议", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                 }

    def tokenize(text):
        words = jieba.lcut(text)
        return [
            w for w in words
            if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
        ]

    tokenized = [" ".join(tokenize(text)) for text in cluster_texts]

    # 3. TF-IDF keyword extraction
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform(tokenized)
        feature_names = vectorizer.get_feature_names_out()

        # Take the terms with the highest mean TF-IDF across the cluster
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = mean_tfidf.argsort()[-top_n:][::-1]
        keywords = [feature_names[i] for i in top_indices]
        return " | ".join(keywords)
    except ValueError:
        return "关键词提取失败"


model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
# Load the sentence-embedding model from the local snapshot
MODEL = SentenceTransformer(model_path)

def encode_sentences_with_cache(sentences, model):
    """
    Encode the sentences with the SBERT model.
    (On-disk pickle caching is currently disabled; every call re-encodes.)
    """
    sentence_vectors = model.encode(
        sentences,
        batch_size=16,
        show_progress_bar=True,
        convert_to_tensor=False
    )
    return sentence_vectors


def auto_select_k(embeddings, max_k=10):
    """Automatically pick the cluster count with the highest silhouette score."""
    sil_scores = []
    k_range = range(2, min(max_k + 1, len(embeddings) // 2))
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        sil_scores.append(silhouette_score(embeddings, labels))

    best_k = k_range[np.argmax(sil_scores)]
    print(f"自动选择最佳聚类数: k = {best_k} (轮廓系数: {max(sil_scores):.3f})")
    return best_k


def cluster_and_visualize(
        excel_path: str,
        questions=['s1', 's2', 's3', 's4'],
        max_k=15
):
    """
    Input: Excel file with the s1-s4 feedback columns.
    Output: (cluster plot as base64, list of per-cluster statistics dicts)
    """
    # 1. Load data
    df = pd.read_excel(excel_path)
    sentences = []
    meta = []
    for idx, row in df.iterrows():
        for q in questions:
            text = str(row[q]).strip()
            if text and text.lower() != 'nan':  # skip empty cells (NaN stringifies to 'nan')
                sentences.append(text)
                meta.append((row['no'], q))
    emb = encode_sentences_with_cache(sentences, MODEL)

    # 2. Pick k automatically
    n_clusters = auto_select_k(emb, max_k=max_k)

    # 3. Cluster
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
    labels = kmeans.labels_
    # For each center, the index of its nearest sentence (shape: n_clusters,)
    closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)

    # Auto-extracted keywords replace hand-written cluster names
    cluster_keywords_auto = []
    for i in range(n_clusters):
        kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
        cluster_keywords_auto.append(kw)

    # 4. Statistics
    stats = []
    total = len(sentences)
    for i in range(n_clusters):
        cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
        size = len(cluster_sents)
        rep_sent = sentences[closest[i]]
        stats.append({
            'cluster_id': i,
            'size': size,
            'ratio': size / total,
            'rep_sentence': rep_sent,
            'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        })

    # 5. Visualization: t-SNE projection to 2-D
    tsne = TSNE(n_components=2, random_state=42)
    emb_2d = tsne.fit_transform(emb)

    cmap = cm.get_cmap('rainbow', n_clusters)
    cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
    point_colors = [cluster_colors[l] for l in labels]

    fig = go.Figure()
    # Data points
    fig.add_trace(go.Scatter(
        x=emb_2d[:, 0], y=emb_2d[:, 1],
        mode='markers',
        marker=dict(size=10, color=point_colors, opacity=0.7),
        text=[f"聚类 {l}" for l in labels],
        hoverinfo='text',
        showlegend=False
    ))
    # Cluster centers, plotted at the 2-D position of their nearest sentence
    center_x = emb_2d[closest, 0]
    center_y = emb_2d[closest, 1]
    for i, (x, y) in enumerate(zip(center_x, center_y)):
        keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
        fig.add_trace(go.Scatter(
            x=[x], y=[y],
            mode='markers+text',
            marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
            text=[keyword],
            textposition="top center",
            textfont=dict(family="SimHei", size=20, color='black'),
            showlegend=False
        ))

    fig.update_layout(
        title="EGISInsight:学生反馈聚类洞察",
        font=dict(family="Microsoft YaHei", size=18),
        width=900, height=600,
        plot_bgcolor='#F5F5F5'
    )

    img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
    b64 = base64.b64encode(img_bytes).decode()

    return b64, stats
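
The silhouette-based k selection can be sanity-checked on synthetic blobs with a known cluster count; a minimal sketch (note that importing cluster_insight loads the local SBERT snapshot at module level, so this assumes that model directory is present):

# Sanity check: the silhouette criterion should recover the true blob count.
from sklearn.datasets import make_blobs

from cluster_insight import auto_select_k  # module-level model load must succeed

X, _ = make_blobs(n_samples=200, centers=4, cluster_std=0.5, random_state=42)
k = auto_select_k(X, max_k=10)
print(k)  # expected: 4 for well-separated blobs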
feedback_analyzer.py
ADDED
@@ -0,0 +1,210 @@
"""
Build the domain stop-word list and the Sankey-diagram data.
"""

import re
import warnings
from collections import Counter

import jieba
import pandas as pd

warnings.filterwarnings('ignore')

# Domain vocabulary (would otherwise live in domain_vocab.py)
DOMAIN_VOCAB = [
    # Chinese domain terms (deduplicated and merged)
    "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
    "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
    "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
    "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
    "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
    "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",

    # English domain terms (deduplicated)
    "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
    "Con", "Getis - Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
    "data manager", "POI",
]


def generate_domain_stopwords(df, text_columns, domain_keywords):
    """Build a stop-word list tailored to the domain vocabulary."""
    # 1. Base stop words (generic function words)
    common_stopwords = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么", "怎么",
                        "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点", "一点",
                        "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "进行", "对于", "实际", "情况", "结合",
                        "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
                        "建议", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
                        }
    # 2. Domain keywords are never treated as stop words
    domain_words = set(domain_keywords)
    # 3. Concatenate and preprocess all text
    all_text = ""
    for col in text_columns:
        all_text += " ".join(df[col].fillna("").astype(str))

    all_text = re.sub(r"[^\w\s]", "", all_text)
    all_text = re.sub(r"\d+", "", all_text)
    all_text = all_text.lower()  # normalize English to lower case
    words = jieba.lcut(all_text)

    # 4. Word frequencies (single characters dropped)
    word_freq = Counter(words)
    word_freq = {word: freq for word, freq in word_freq.items() if len(word) > 1}

    # 5. Candidate stop words: generic function words plus terms that occur in
    #    more than 50% of responses yet are not domain keywords
    stopwords = common_stopwords.copy()
    for word, freq in word_freq.items():
        if freq > len(df) * 0.5 and word not in domain_words:
            stopwords.add(word)

    return stopwords

def load_and_preprocess_data(df, stopwords, domain_words):
    """Load and preprocess the data, with improved recognition of domain terms."""
    stopwords = set(stopwords)

    question_types = {
        "s1": "难点",
        "s2": "讲解需求",
        "s3": "操作疑惑",
        "s4": "应用场景"
    }

    # Causal ordering of the questions (smaller index = earlier)
    question_hierarchy = ["s1", "s2", "s3", "s4"]

    # Pre-load every domain term into jieba and a lookup dict.
    # Keys are lower-cased because the text is lower-cased before matching.
    professional_dict = {}
    for word in domain_words:
        jieba.add_word(word, freq=10000)  # high frequency keeps the term unsplit
        professional_dict[word.lower()] = 1

    # Regex over the domain terms, sorted longest-first so longer terms win
    sorted_domain = sorted(domain_words, key=len, reverse=True)
    pattern_str = "|".join(re.escape(word.lower()) for word in sorted_domain)
    professional_pattern = re.compile(f"({pattern_str})")

    def clean_text(text):
        if not isinstance(text, str):
            return []
        # Step 1: basic cleaning
        text_cleaned = re.sub(r"[^\w\s]", "", text)
        text_cleaned = re.sub(r"\d+", "", text_cleaned)
        text_cleaned = text_cleaned.lower()
        # Step 2: locate every domain-term occurrence
        matches = []
        for match in professional_pattern.finditer(text_cleaned):
            start, end = match.span()
            matches.append((start, end, text_cleaned[start:end]))
        # Step 3: split the text while protecting domain terms
        segments = []
        last_end = 0
        for start, end, word in matches:
            # Plain text before the match
            if start > last_end:
                segments.append(text_cleaned[last_end:start])
            # The domain term itself, kept whole
            segments.append(word)
            last_end = end
        # Trailing plain text
        if last_end < len(text_cleaned):
            segments.append(text_cleaned[last_end:])
        # Step 4: tokenize only the non-domain segments
        final_words = []
        for segment in segments:
            if segment in professional_dict:
                final_words.append(segment)
            else:
                words = jieba.lcut(segment)
                words = [w for w in words if w not in stopwords and len(w) > 1]
                final_words.extend(words)
        return final_words

    for col in ["s1", "s2", "s3", "s4"]:
        df[col + "_words"] = df[col].apply(clean_text)

    return df, question_types, question_hierarchy

def build_sankey_data(df, question_columns, top_n=30):
    """
    Build the Sankey-diagram data (DataFrame: source, target, value),
    keeping only the top_n globally most frequent keywords.
    """
    question_labels = {
        "s1": "S1_难点",
        "s2": "S2_讲解需求",
        "s3": "S3_操作疑惑",
        "s4": "S4_应用场景"
    }

    # 1. Global keyword frequencies
    all_keywords = []
    for col in question_columns:
        all_keywords.extend([kw for kws in df[col + "_words"] for kw in kws])
    keyword_freq = Counter(all_keywords)
    core_keywords = set([kw for kw, _ in keyword_freq.most_common(top_n)])

    # 2. Count question -> keyword links
    link_counter = Counter()
    for _, row in df.iterrows():
        for q in question_columns:
            q_label = question_labels[q]
            keywords = row[q + "_words"]
            for kw in keywords:
                if kw in core_keywords:
                    link_counter[(q_label, kw)] += 1

    # 3. Convert to a DataFrame
    sankey_data = pd.DataFrame([
        {"source": src, "target": tgt, "value": val}
        for (src, tgt), val in link_counter.items()
    ])
    return sankey_data

def generate_sankey_df(file_path, text_columns, domain_words, top_n=30):
    df = pd.read_excel(file_path)

    stopwords = generate_domain_stopwords(df, text_columns, domain_words)
    df, question_types, question_hierarchy = load_and_preprocess_data(df, stopwords, domain_words)
    sankey_data = build_sankey_data(df, text_columns, top_n)
    return sankey_data

if __name__ == '__main__':
    # Run the full analysis pipeline on a local file
    file_path = "E:\\data\\20250621Edu\\ex02.xlsx"
    text_columns = ["s1", "s2", "s3", "s4"]
    top_n = 30  # keep the 30 most frequent keywords
    sankey_data = generate_sankey_df(file_path, text_columns, DOMAIN_VOCAB, top_n)
    print(sankey_data.head())
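
For reference, generate_sankey_df returns a long-format edge list that plot_sankey_from_df consumes directly; a minimal sketch of the schema, with illustrative counts:

# Illustrative shape of the DataFrame produced by generate_sankey_df:
import pandas as pd

sankey_data = pd.DataFrame([
    {"source": "S1_难点", "target": "核密度分析", "value": 12},
    {"source": "S2_讲解需求", "target": "搜索半径", "value": 7},
    {"source": "S4_应用场景", "target": "城市规划", "value": 9},
])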
main.py
ADDED
@@ -0,0 +1,143 @@
# main.py
import gradio as gr
from analyzer import analyze_teacher_dashboard
from cluster_insight import cluster_and_visualize
from qwen_api import call_qwen


# ================== LLM step (Tongyi Qianwen) ==================
def generate_teaching_advice(sankey_b64, cluster_stats):
    # 1. Build a structured, domain-specific prompt
    prompt = """
你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:

【桑基图分析】
- 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
- 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用

【聚类分析】
"""
    for s in cluster_stats[:3]:  # top 3 clusters only
        prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
        prompt += f"  代表句:{s['rep_sentence'][:100]}\n"

    prompt += """
【要求】
1. 诊断核心教学痛点(3条)
2. 提出针对性优化措施(微课/演示/作业)
3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
4. 建议 1 个课后作业(验证学生掌握)

【输出格式】
# 教学优化方案
## 1. 核心痛点
## 2. 优化措施
## 3. 微课脚本
## 4. 课后作业
"""

    # 2. Call Tongyi Qianwen
    advice = call_qwen(prompt)
    return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"


# ================== Gradio callback ==================
def analyze_report(file):
    if not file:
        return "请上传 Excel 文件"

    try:
        # gr.File may deliver a plain path string or a wrapper exposing .name
        excel_path = file if isinstance(file, str) else file.name
        # 1. Analysis -> Sankey diagram
        sankey_b64 = analyze_teacher_dashboard(excel_path=excel_path)
        # 2. Cluster plot + per-cluster statistics
        cluster_b64, cluster_stats = cluster_and_visualize(excel_path=excel_path)
        # 3. Teaching advice from the LLM
        advice = generate_teaching_advice(sankey_b64, cluster_stats)

        # 4. Cluster statistics table
        stats_table = """
        <h3>聚类主题统计</h3>
        <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
            <tr style="background:#f0f0f0;">
                <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
            </tr>
        """
        for s in cluster_stats:
            stats_table += f"""
            <tr>
                <td>{s['cluster_id']}</td>
                <td><strong>{s['keyword']}</strong></td>
                <td>{s['size']}</td>
                <td>{s['ratio']:.1%}</td>
                <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
            </tr>
            """
        stats_table += "</table>"

        # 5. Final HTML output
        html = f"""
        <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
            <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
            <p style="text-align:center; color:#555; font-size:16px;">
                GIS 教学智能体 · 循证教学优化
            </p>
            <hr style="border: 1px solid #eee; margin: 30px 0;">

            <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
            <img src="data:image/png;base64,{sankey_b64}"
                 style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">

            <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
            <img src="data:image/png;base64,{cluster_b64}"
                 style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">

            <div style="margin-top:30px;">
                {stats_table}
            </div>

            <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
                {advice}
            </div>

            <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
                EGISInsight © 2025 | 从数据到教学内容改革
            </p>
        </div>
        """
        return html

    except Exception as e:
        return f"分析失败:{e}"


# ================== Gradio UI ==================
with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
    gr.Markdown("# GIS实验报告智能分析系统")
    gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")

    with gr.Row():
        file_input = gr.File(
            label="上传 ex02.xlsx(含 s1-s4 列)",
            file_types=[".xlsx"]
        )

    with gr.Row():
        output = gr.HTML(label="分析结果")

    # The handler returns a single HTML string for the single output component
    file_input.change(analyze_report, inputs=file_input, outputs=output)

    gr.Markdown("---")
    gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")

# ================== Launch ==================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True  # set to False to disable the public share link
    )
qwen_api.py
ADDED
@@ -0,0 +1,30 @@
# qwen_api.py
import os

import requests

# Read the key from the environment rather than hard-coding a secret in the repo
QWEN_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

def call_qwen(prompt: str) -> str:
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen-plus",
        "input": {"prompt": prompt},
        "parameters": {
            "result_format": "text",
            "temperature": 0.7,
            "top_p": 0.8
        }
    }
    try:
        resp = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=30)
        result = resp.json()
        return result['output']['text']
    except Exception as e:
        # Return the error as text so the UI degrades gracefully instead of raising
        return f"【大模型调用失败】{e}"
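
A minimal smoke test for the wrapper, assuming a valid DASHSCOPE_API_KEY is exported in the environment:

# Smoke-test the DashScope wrapper (requires a valid DASHSCOPE_API_KEY).
from qwen_api import call_qwen

reply = call_qwen("用一句话介绍核密度分析。")
print(reply)  # on failure this is a 【大模型调用失败】… string, not an exception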
requirements.txt
ADDED
@@ -0,0 +1,11 @@
gradio==4.26.0
plotly
kaleido
pandas
openpyxl
scikit-learn
jieba
sentence-transformers
requests
torch
transformers
matplotlib<3.9  # cluster_insight.py and sankey_plot.py use the legacy cm.get_cmap API
sankey_plot.py
ADDED
@@ -0,0 +1,54 @@
# sankey_plot.py
import base64

import matplotlib.cm as cm
import matplotlib.colors as mcolors
import pandas as pd
import plotly.graph_objects as go

def plot_sankey_from_df(sankey_df: pd.DataFrame, title="问题 → 关键词共现") -> str:
    if sankey_df.empty:
        return "无数据"  # sentinel, not valid base64; callers should check for it

    # Keep only the 15 highest-volume target keywords
    top_targets = sankey_df.groupby('target')['value'].sum().sort_values(ascending=False).head(15).index
    df = sankey_df[sankey_df['target'].isin(top_targets)].copy()

    # Node ordering: question nodes first, then keyword nodes
    sources = ['S4_应用场景', 'S3_操作疑惑', 'S2_讲解需求', 'S1_难点']
    sources = [s for s in sources if s in df['source'].unique()]
    targets = top_targets.tolist()
    all_nodes = sources + targets
    node_index = {n: i for i, n in enumerate(all_nodes)}

    # Colors: fixed palette for question nodes, Set3 colormap for keywords
    source_color_map = {
        'S1_难点': '#345DA7', 'S2_讲解需求': '#3B8AC4',
        'S3_操作疑惑': '#4BB4DE', 'S4_应用场景': '#EFDBCB'
    }
    cmap = cm.get_cmap('Set3', len(targets))
    target_colors = [mcolors.to_hex(cmap(i)) for i in range(len(targets))]
    target_color_map = dict(zip(targets, target_colors))

    node_colors = [source_color_map.get(n, target_color_map.get(n, 'gray')) for n in all_nodes]
    link_colors = [target_color_map.get(t, 'gray') for t in df['target']]

    fig = go.Figure(data=[go.Sankey(
        node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5),
                  label=all_nodes, color=node_colors),
        link=dict(
            source=df['source'].map(node_index),
            target=df['target'].map(node_index),
            value=df['value'],
            color=link_colors
        )
    )])

    fig.update_layout(title_text=title, font=dict(family="Microsoft YaHei", size=18), width=900, height=600,
                      margin=dict(l=20, r=20, t=60, b=20))
    # Export a high-resolution PNG (rendered via the kaleido package)
    img_bytes = fig.to_image(
        format="png",
        width=900,
        height=600,
        scale=2  # 2x DPI for a crisp image
    )
    return base64.b64encode(img_bytes).decode()
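
A minimal end-to-end check of the plotting function on a toy edge list (PNG export goes through kaleido; the keyword labels and output file name are illustrative):

# Render a tiny Sankey and write it to disk for visual inspection.
import base64
import pandas as pd

from sankey_plot import plot_sankey_from_df

toy = pd.DataFrame([
    {"source": "S1_难点", "target": "核密度分析", "value": 5},
    {"source": "S2_讲解需求", "target": "核密度分析", "value": 3},
    {"source": "S3_操作疑惑", "target": "搜索半径", "value": 2},
])
b64 = plot_sankey_from_df(toy, title="toy sankey")
with open("toy_sankey.png", "wb") as f:
    f.write(base64.b64decode(b64))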