wxy01giser committed
Commit a215fac · verified · 1 Parent(s): 6ed25c3

Upload 8 files

Files changed (8):
  1. analyzer.py +27 -0
  2. app.gradio.py +143 -0
  3. cluster_insight.py +225 -0
  4. feedback_analyzer.py +210 -0
  5. main.py +143 -0
  6. qwen_api.py +30 -0
  7. requirements.txt +11 -0
  8. sankey_plot.py +54 -0
analyzer.py ADDED
@@ -0,0 +1,27 @@
+ # analyzer.py
+ from feedback_analyzer import generate_sankey_df, DOMAIN_VOCAB
+ from sankey_plot import plot_sankey_from_df
+
+ def analyze_teacher_dashboard(excel_path: str) -> str:
+     """
+     Input: path to the merged Excel workbook.
+     Output: the Sankey diagram as a base64 string (embeddable directly in HTML).
+     Fully in-memory pipeline: no intermediate files, no redundant return values.
+     """
+     # 1. Build the stopword list and the Sankey data in memory
+     sankey_df = generate_sankey_df(
+         file_path=excel_path,
+         text_columns=["s1", "s2", "s3", "s4"],
+         domain_words=DOMAIN_VOCAB,
+         top_n=30
+     )
+
+     # 2. Render the plot in memory → base64
+     sankey_b64 = plot_sankey_from_df(sankey_df, title="GIS实践教学改革方向捕捉")
+
+     # 3. Return only the final result
+     return sankey_b64
+
+ if __name__ == '__main__':
+     sankey_b64 = analyze_teacher_dashboard(excel_path="E:\\data\\20250621Edu\\ex02.xlsx")
+     print(sankey_b64[:80])  # print a prefix to confirm an image was produced
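For a quick local check of the returned string, a minimal sketch (the workbook path is a placeholder) that dumps the PNG into a throwaway HTML page:

    from analyzer import analyze_teacher_dashboard

    b64 = analyze_teacher_dashboard(excel_path="ex02.xlsx")  # placeholder path
    with open("sankey_preview.html", "w", encoding="utf-8") as f:
        f.write(f'<img src="data:image/png;base64,{b64}">')  # open in a browser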
app.gradio.py ADDED
@@ -0,0 +1,143 @@
+ # app.gradio.py
+ import gradio as gr
+ from analyzer import analyze_teacher_dashboard
+ from cluster_insight import cluster_and_visualize
+ from qwen_api import call_qwen
+
+
+ # ================== LLM interface (Tongyi Qianwen) ==================
+ def generate_teaching_advice(sankey_b64, cluster_stats):
+     # 1. Build a structured prompt.
+     # NOTE: the Sankey flow summary below is currently hard-coded;
+     # only the cluster statistics are filled in dynamically.
+     prompt = f"""
+ 你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:
+
+ 【桑基图分析】
+ - 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
+ - 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用
+
+ 【聚类分析】
+ """
+     for s in cluster_stats[:3]:  # top 3 clusters
+         prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
+         prompt += f" 代表句:{s['rep_sentence'][:100]}\n"
+
+     prompt += """
+ 【要求】
+ 1. 诊断核心教学痛点(3条)
+ 2. 提出针对性优化措施(微课/演示/作业)
+ 3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
+ 4. 建议 1 个课后作业(验证学生掌握)
+
+ 【输出格式】
+ # 教学优化方案
+ ## 1. 核心痛点
+ ## 2. 优化措施
+ ## 3. 微课脚本
+ ## 4. 课后作业
+ """
+
+     # 2. Call Qwen and wrap the answer for HTML display
+     advice = call_qwen(prompt)
+     return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"
+
+
+ # ================== Analysis pipeline ==================
+ def analyze_report(file):
+     if not file:
+         return "请上传 Excel 文件"
+
+     try:
+         # 1. Sankey diagram
+         sankey_b64 = analyze_teacher_dashboard(excel_path=file.name)
+         # 2. Cluster plot and statistics
+         cluster_b64, cluster_stats = cluster_and_visualize(excel_path=file.name)
+         # 3. Teaching advice from the LLM
+         advice = generate_teaching_advice(sankey_b64, cluster_stats)
+
+         # 4. Cluster statistics table
+         stats_table = """
+         <h3>聚类主题统计</h3>
+         <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
+             <tr style="background:#f0f0f0;">
+                 <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
+             </tr>
+         """
+         for s in cluster_stats:
+             stats_table += f"""
+             <tr>
+                 <td>{s['cluster_id']}</td>
+                 <td><strong>{s['keyword']}</strong></td>
+                 <td>{s['size']}</td>
+                 <td>{s['ratio']:.1%}</td>
+                 <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
+             </tr>
+             """
+         stats_table += "</table>"
+
+         # 5. Final HTML output
+         html = f"""
+         <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
+             <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
+             <p style="text-align:center; color:#555; font-size:16px;">
+                 GIS 教学智能体 · 循证教学优化
+             </p>
+             <hr style="border: 1px solid #eee; margin: 30px 0;">
+
+             <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
+             <img src="data:image/png;base64,{sankey_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
+             <img src="data:image/png;base64,{cluster_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <div style="margin-top:30px;">
+                 {stats_table}
+             </div>
+
+             <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
+                 {advice}
+             </div>
+
+             <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
+                 EGISInsight © 2025 | 从数据到教学内容改革
+             </p>
+         </div>
+         """
+         return html
+
+     except Exception as e:
+         return f"分析失败:{str(e)}"
+
+
+ # ================== Gradio UI ==================
+ with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
+     gr.Markdown("# GIS实验报告智能分析系统")
+     gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")
+
+     with gr.Row():
+         file_input = gr.File(
+             label="上传 ex02.xlsx(含 s1-s4 列)",
+             file_types=[".xlsx"]
+         )
+
+     with gr.Row():
+         output = gr.HTML(label="分析结果")
+
+     # analyze_report returns a single HTML string, matching the single output
+     file_input.change(analyze_report, inputs=file_input, outputs=output)
+
+     gr.Markdown("---")
+     gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")
+
+ # ================== Launch ==================
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True  # set False to keep the app local
+     )
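Note that a file named app.gradio.py cannot be imported with a plain import statement because of the extra dot; the sketch below assumes the module has been renamed (for example to app_gradio.py). Gradio hands analyze_report an object exposing a .name attribute, which SimpleNamespace can mimic for a UI-free smoke test:

    from types import SimpleNamespace
    from app_gradio import analyze_report  # assumed rename of app.gradio.py

    fake_upload = SimpleNamespace(name="ex02.xlsx")  # placeholder path
    print(analyze_report(fake_upload)[:200])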
cluster_insight.py ADDED
@@ -0,0 +1,225 @@
+ '''
+ Clustering-based visualization module (sentence-vector clustering).
+ '''
+ # cluster_insight.py
+ import base64
+
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from sklearn.manifold import TSNE
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import plotly.graph_objects as go
+ import matplotlib.cm as cm
+ import matplotlib.colors as mcolors
+ from sentence_transformers import SentenceTransformer
+ import jieba
+
+
+ def extract_cluster_keywords_auto(sentences, labels, cluster_id, top_n=3):
+     """
+     Automatically extract keywords for one cluster.
+     :param sentences: all sentences
+     :param labels: cluster labels
+     :param cluster_id: the cluster to summarize
+     :param top_n: number of keywords to keep
+     :return: keyword string
+     """
+     # 1. Collect the sentences belonging to this cluster
+     cluster_texts = [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]
+     if not cluster_texts:
+         return "无数据"
+
+     # 2. Tokenize, protecting domain terms from being split
+     DOMAIN_SET = {
+         # Chinese domain terms (deduplicated and merged)
+         "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
+         "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
+         "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
+         "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
+         "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
+         "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
+
+         # English domain terms (deduplicated)
+         "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
+         "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
+         "data manager", "POI",
+     }
+     for word in DOMAIN_SET:
+         jieba.add_word(word, freq=10000)
+
+     # Generic stopwords: function words plus course-generic vocabulary
+     STOPWORDS = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么",
+                  "怎么", "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点",
+                  "一点", "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "对于", "进行", "实际", "情况", "结合",
+                  "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
+                  "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
+                  }
+
+     def tokenize(text):
+         words = jieba.lcut(text)
+         return [
+             w for w in words
+             if len(w) > 1 and w not in STOPWORDS and not w.isdigit()
+         ]
+
+     tokenized = [" ".join(tokenize(text)) for text in cluster_texts]
+
+     # 3. TF-IDF keyword extraction
+     vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
+     try:
+         tfidf_matrix = vectorizer.fit_transform(tokenized)
+         feature_names = vectorizer.get_feature_names_out()
+
+         # Keep the terms with the highest mean TF-IDF across the cluster
+         mean_tfidf = tfidf_matrix.mean(axis=0).A1
+         top_indices = mean_tfidf.argsort()[-top_n:][::-1]
+         keywords = [feature_names[i] for i in top_indices]
+         return " | ".join(keywords)
+     except Exception:
+         return "关键词提取失败"
+
+
+ # Load the sentence-embedding model from a local snapshot
+ model_path = r'.\sbert\models--shibing624--text2vec-base-chinese\snapshots\183bb99aa7af74355fb58d16edf8c13ae7c5433e'
+ MODEL = SentenceTransformer(model_path)
+
+
+ def encode_sentences_with_cache(sentences, model):
+     """
+     Encode sentences with SBERT. (The pickle-based on-disk cache has been
+     disabled; vectors are recomputed on every call.)
+     """
+     sentence_vectors = model.encode(
+         sentences,
+         batch_size=16,
+         show_progress_bar=True,
+         convert_to_tensor=False
+     )
+     return sentence_vectors
+
+
+ def auto_select_k(embeddings, max_k=10):
+     """Pick the cluster count with the highest silhouette score."""
+     sil_scores = []
+     k_range = range(2, min(max_k + 1, len(embeddings) // 2))
+     for k in k_range:
+         kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
+         labels = kmeans.fit_predict(embeddings)
+         sil_scores.append(silhouette_score(embeddings, labels))
+
+     best_k = k_range[np.argmax(sil_scores)]
+     print(f"Auto-selected cluster count: k = {best_k} (silhouette: {max(sil_scores):.3f})")
+     return best_k
+
+
+ def cluster_and_visualize(
+     excel_path: str,
+     questions=['s1', 's2', 's3', 's4'],
+     max_k=15
+ ):
+     """
+     Input: Excel path.
+     Output: (cluster plot as base64, list of per-cluster statistics dicts)
+     """
+     # 1. Load data
+     df = pd.read_excel(excel_path)
+     sentences = []
+     meta = []  # (student no, question id) per sentence, kept for traceability
+     for idx, row in df.iterrows():
+         for q in questions:
+             text = str(row[q]).strip() if pd.notna(row[q]) else ""
+             if text:
+                 sentences.append(text)
+                 meta.append((row['no'], q))
+     emb = encode_sentences_with_cache(sentences, MODEL)
+
+     # 2. Choose k automatically
+     n_clusters = auto_select_k(emb, max_k=max_k)
+
+     # 3. Cluster; `closest` is the index of the sentence nearest each centroid
+     kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(emb)
+     labels = kmeans.labels_
+     closest = np.argmin(np.linalg.norm(emb - kmeans.cluster_centers_[:, np.newaxis], axis=2), axis=1)
+
+     # Auto-extracted keywords replace hand-written cluster names
+     cluster_keywords_auto = []
+     for i in range(n_clusters):
+         kw = extract_cluster_keywords_auto(sentences, labels, i, top_n=5)
+         cluster_keywords_auto.append(kw)
+
+     # 4. Per-cluster statistics
+     stats = []
+     total = len(sentences)
+     for i in range(n_clusters):
+         cluster_sents = [s for s, l in zip(sentences, labels) if l == i]
+         size = len(cluster_sents)
+         rep_sent = sentences[closest[i]]
+         stats.append({
+             'cluster_id': i,
+             'size': size,
+             'ratio': size / total,
+             'rep_sentence': rep_sent,
+             'keyword': cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
+         })
+
+     # 5. Visualization: t-SNE projection to 2-D
+     tsne = TSNE(n_components=2, random_state=42)
+     emb_2d = tsne.fit_transform(emb)
+
+     cmap = cm.get_cmap('rainbow', n_clusters)
+     cluster_colors = [mcolors.rgb2hex(cmap(i)[:3]) for i in range(n_clusters)]
+     point_colors = [cluster_colors[l] for l in labels]
+
+     fig = go.Figure()
+     # Data points
+     fig.add_trace(go.Scatter(
+         x=emb_2d[:, 0], y=emb_2d[:, 1],
+         mode='markers',
+         marker=dict(size=10, color=point_colors, opacity=0.7),
+         text=[f"聚类 {l}" for l in labels],
+         hoverinfo='text',
+         showlegend=False
+     ))
+     # Cluster centres, labelled with their auto-extracted keywords
+     center_x = emb_2d[closest, 0]
+     center_y = emb_2d[closest, 1]
+     for i, (x, y) in enumerate(zip(center_x, center_y)):
+         keyword = cluster_keywords_auto[i] if cluster_keywords_auto else f"聚类 {i}"
+         fig.add_trace(go.Scatter(
+             x=[x], y=[y],
+             mode='markers+text',
+             marker=dict(size=30, color=cluster_colors[i], line=dict(width=2, color='black')),
+             text=[keyword],
+             textposition="top center",
+             textfont=dict(family="SimHei", size=20, color='black'),
+             showlegend=False
+         ))
+
+     fig.update_layout(
+         title="EGISInsight:学生反馈聚类洞察",
+         font=dict(family="Microsoft YaHei", size=18),
+         width=900, height=600,
+         plot_bgcolor='#F5F5F5'
+     )
+
+     # Export to PNG (requires kaleido) and encode as base64
+     img_bytes = fig.to_image(format="png", width=900, height=600, scale=2)
+     b64 = base64.b64encode(img_bytes).decode()
+
+     return b64, stats
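A self-contained illustration of the silhouette-based k selection used by auto_select_k (inlined here because importing cluster_insight loads the SBERT model at import time): with three well-separated synthetic blobs, the best silhouette should land on k = 3.

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_score

    X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.5, random_state=0)
    scores = {k: silhouette_score(X, KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X))
              for k in range(2, 9)}
    print(max(scores, key=scores.get))  # expected: 3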
feedback_analyzer.py ADDED
@@ -0,0 +1,210 @@
+ """
+ Generate the stopword list and the Sankey-diagram data.
+ """
+
+ import re
+ import warnings
+ from collections import Counter
+
+ import pandas as pd
+ import jieba
+
+ warnings.filterwarnings('ignore')
+
+ # Domain vocabulary, protected from segmentation and from the stopword filter
+ DOMAIN_VOCAB = [
+     # Chinese domain terms (deduplicated and merged)
+     "空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
+     "数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
+     "地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
+     "核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
+     "地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
+     "参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
+
+     # English domain terms (deduplicated)
+     "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
+     "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
+     "data manager", "POI",
+ ]
+
+
+ def generate_domain_stopwords(df, text_columns, domain_keywords):
+     """Build a stopword list that spares the domain keywords."""
+     # 1. Base stopwords: generic function words plus course-generic vocabulary
+     common_stopwords = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么", "怎么",
+                         "哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点", "一点",
+                         "一些", "进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "进行", "对于", "实际", "情况", "结合",
+                         "学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
+                         "建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
+                         }
+     # 2. Domain keywords must never become stopwords
+     domain_words = set(domain_keywords)
+
+     # 3. Merge and normalize all answer text
+     all_text = ""
+     for col in text_columns:
+         all_text += " ".join(df[col].fillna("").astype(str))
+
+     all_text = re.sub(r"[^\w\s]", "", all_text)
+     all_text = re.sub(r"\d+", "", all_text)
+     all_text = all_text.lower()  # normalize English to lowercase
+     words = jieba.lcut(all_text)
+
+     # 4. Word frequencies (single characters dropped)
+     word_freq = Counter(words)
+     word_freq = {word: freq for word, freq in word_freq.items() if len(word) > 1}
+
+     # 5. Candidate stopwords: the base list, plus words that occur in more
+     #    than half of the rows yet are not domain keywords
+     stopwords = common_stopwords.copy()
+     for word, freq in word_freq.items():
+         if freq > len(df) * 0.5 and word not in domain_words:
+             stopwords.add(word)
+
+     return stopwords
+
+
+ def load_and_preprocess_data(df, stopwords, domain_words):
+     """Load and preprocess the data, with protected recognition of domain terms."""
+     stopwords = set(stopwords)
+
+     question_types = {
+         "s1": "难点",
+         "s2": "讲解需求",
+         "s3": "操作疑惑",
+         "s4": "应用场景"
+     }
+
+     # Causal order of the questions (lower index = earlier in the chain)
+     question_hierarchy = ["s1", "s2", "s3", "s4"]
+
+     # Register every domain term with jieba so it is segmented as one token
+     professional_dict = {}
+     for word in domain_words:
+         jieba.add_word(word, freq=10000)  # high frequency → recognized first
+         professional_dict[word] = 1
+
+     # Regex over the domain terms, longest first so long terms match before
+     # their substrings
+     sorted_domain = sorted(domain_words, key=len, reverse=True)
+     pattern_str = "|".join(re.escape(word) for word in sorted_domain)
+     professional_pattern = re.compile(f"({pattern_str})")
+
+     def clean_text(text):
+         if not isinstance(text, str):
+             return []
+         # Step 1: basic cleaning
+         text_cleaned = re.sub(r"[^\w\s]", "", text)
+         text_cleaned = re.sub(r"\d+", "", text_cleaned)
+         text_cleaned = text_cleaned.lower()
+         # Step 2: locate every domain-term occurrence
+         matches = []
+         for match in professional_pattern.finditer(text_cleaned):
+             start, end = match.span()
+             matches.append((start, end, text_cleaned[start:end]))
+         # Step 3: split the text while protecting the domain terms
+         segments = []
+         last_end = 0
+         for start, end, word in matches:
+             if start > last_end:
+                 segments.append(text_cleaned[last_end:start])
+             segments.append(word)  # keep the domain term as one segment
+             last_end = end
+         if last_end < len(text_cleaned):
+             segments.append(text_cleaned[last_end:])
+         # Step 4: segment only the non-domain parts
+         final_words = []
+         for segment in segments:
+             if segment in professional_dict:
+                 final_words.append(segment)  # domain term, kept whole
+             else:
+                 words = jieba.lcut(segment)
+                 words = [w for w in words if w not in stopwords and len(w) > 1]
+                 final_words.extend(words)
+         return final_words
+
+     for col in ["s1", "s2", "s3", "s4"]:
+         df[col + "_words"] = df[col].apply(clean_text)
+
+     return df, question_types, question_hierarchy
+
+
+ def build_sankey_data(df, question_columns, top_n=30):
+     """
+     Build the Sankey-diagram data (DataFrame: source, target, value),
+     keeping only the top_n globally most frequent keywords.
+     """
+     question_labels = {
+         "s1": "S1_难点",
+         "s2": "S2_讲解需求",
+         "s3": "S3_操作疑惑",
+         "s4": "S4_应用场景"
+     }
+
+     # 1. Global keyword frequencies
+     all_keywords = []
+     for col in question_columns:
+         all_keywords.extend([kw for kws in df[col + "_words"] for kw in kws])
+     keyword_freq = Counter(all_keywords)
+     core_keywords = set([kw for kw, _ in keyword_freq.most_common(top_n)])
+
+     # 2. Count (question label, keyword) links
+     link_counter = Counter()
+     for _, row in df.iterrows():
+         for q in question_columns:
+             q_label = question_labels[q]
+             keywords = row[q + "_words"]
+             for kw in keywords:
+                 if kw in core_keywords:
+                     link_counter[(q_label, kw)] += 1
+
+     # 3. To DataFrame
+     sankey_data = pd.DataFrame([
+         {"source": src, "target": tgt, "value": val}
+         for (src, tgt), val in link_counter.items()
+     ])
+     print(f"Sankey link table built: {len(sankey_data)} rows")
+     return sankey_data
+
+
+ def generate_sankey_df(file_path, text_columns, domain_words, top_n=30):
+     df = pd.read_excel(file_path)
+     stopwords = generate_domain_stopwords(df, text_columns, domain_words)
+     df, question_types, question_hierarchy = load_and_preprocess_data(df, stopwords, domain_words)
+     sankey_data = build_sankey_data(df, text_columns, top_n)
+     return sankey_data
+
+
+ if __name__ == '__main__':
+     # Run the full analysis pipeline on a local workbook
+     file_path = "E:\\data\\20250621Edu\\ex02.xlsx"
+     text_columns = ["s1", "s2", "s3", "s4"]
+     top_n = 30  # keep the 30 most frequent keywords
+     sankey_data = generate_sankey_df(file_path, text_columns, DOMAIN_VOCAB, top_n)
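A shape check for build_sankey_data on a hand-tokenized toy frame: it counts (question label, keyword) co-occurrences into source/target/value rows, so no Excel file or jieba run is needed.

    import pandas as pd
    from feedback_analyzer import build_sankey_data

    toy = pd.DataFrame({
        "s1_words": [["核密度分析", "搜索半径"], ["栅格计算器"]],
        "s2_words": [["搜索半径"], []],
        "s3_words": [[], ["重分类"]],
        "s4_words": [["城市规划"], ["城市规划"]],
    })
    print(build_sankey_data(toy, ["s1", "s2", "s3", "s4"], top_n=10))
    # e.g. S1_难点 → 核密度分析 (value 1), S4_应用场景 → 城市规划 (value 2)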
main.py ADDED
@@ -0,0 +1,143 @@
+ # main.py — currently identical to app.gradio.py
+ import gradio as gr
+ from analyzer import analyze_teacher_dashboard
+ from cluster_insight import cluster_and_visualize
+ from qwen_api import call_qwen
+
+
+ # ================== LLM interface (Tongyi Qianwen) ==================
+ def generate_teaching_advice(sankey_b64, cluster_stats):
+     # 1. Build a structured prompt.
+     # NOTE: the Sankey flow summary below is currently hard-coded;
+     # only the cluster statistics are filled in dynamically.
+     prompt = f"""
+ 你是一名GIS实验教学专家,基于以下分析结果,生成教学优化方案:
+
+ 【桑基图分析】
+ - 学生反馈从 s1→s4 的主要流向:核密度 → 参数设置 → 应用场景
+ - 最粗路径:核密度分析 → 搜索半径选择 → 城市规划应用
+
+ 【聚类分析】
+ """
+     for s in cluster_stats[:3]:  # top 3 clusters
+         prompt += f"- 聚类 {s['cluster_id']}:{s['keyword']}({s['size']}条,占{s['ratio']:.1%})\n"
+         prompt += f" 代表句:{s['rep_sentence'][:100]}\n"
+
+     prompt += """
+ 【要求】
+ 1. 诊断核心教学痛点(3条)
+ 2. 提出针对性优化措施(微课/演示/作业)
+ 3. 设计 1 个 2 分钟微课脚本(标题+3步演示)
+ 4. 建议 1 个课后作业(验证学生掌握)
+
+ 【输出格式】
+ # 教学优化方案
+ ## 1. 核心痛点
+ ## 2. 优化措施
+ ## 3. 微课脚本
+ ## 4. 课后作业
+ """
+
+     # 2. Call Qwen and wrap the answer for HTML display
+     advice = call_qwen(prompt)
+     return f"<pre style='background:#f8f9fa; padding:15px; border-radius:8px; white-space: pre-wrap;'>{advice}</pre>"
+
+
+ # ================== Analysis pipeline ==================
+ def analyze_report(file):
+     if not file:
+         return "请上传 Excel 文件"
+
+     try:
+         # 1. Sankey diagram
+         sankey_b64 = analyze_teacher_dashboard(excel_path=file.name)
+         # 2. Cluster plot and statistics
+         cluster_b64, cluster_stats = cluster_and_visualize(excel_path=file.name)
+         # 3. Teaching advice from the LLM
+         advice = generate_teaching_advice(sankey_b64, cluster_stats)
+
+         # 4. Cluster statistics table
+         stats_table = """
+         <h3>聚类主题统计</h3>
+         <table border="1" style="width:100%; border-collapse: collapse; text-align:center; font-size:14px;">
+             <tr style="background:#f0f0f0;">
+                 <th>聚类</th><th>主题关键词</th><th>反馈数</th><th>占比</th><th>代表句</th>
+             </tr>
+         """
+         for s in cluster_stats:
+             stats_table += f"""
+             <tr>
+                 <td>{s['cluster_id']}</td>
+                 <td><strong>{s['keyword']}</strong></td>
+                 <td>{s['size']}</td>
+                 <td>{s['ratio']:.1%}</td>
+                 <td style="text-align:left; max-width:300px;">{s['rep_sentence'][:60]}...</td>
+             </tr>
+             """
+         stats_table += "</table>"
+
+         # 5. Final HTML output
+         html = f"""
+         <div style="font-family: 'Microsoft YaHei', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px;">
+             <h1 style="text-align:center; color:#1e88e5;">EGISInsight</h1>
+             <p style="text-align:center; color:#555; font-size:16px;">
+                 GIS 教学智能体 · 循证教学优化
+             </p>
+             <hr style="border: 1px solid #eee; margin: 30px 0;">
+
+             <h2 style="color:#1976d2;">1. 实验报告反馈</h2>
+             <img src="data:image/png;base64,{sankey_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <h2 style="color:#388e3c; margin-top:40px;">2. 学生反馈聚类</h2>
+             <img src="data:image/png;base64,{cluster_b64}"
+                  style="width:100%; border-radius:8px; box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
+
+             <div style="margin-top:30px;">
+                 {stats_table}
+             </div>
+
+             <div style="margin-top:30px; padding:20px; background:#f8f9fa; border-radius:8px; text-align:left;">
+                 {advice}
+             </div>
+
+             <p style="text-align:center; color:#999; font-size:13px; margin-top:40px;">
+                 EGISInsight © 2025 | 从数据到教学内容改革
+             </p>
+         </div>
+         """
+         return html
+
+     except Exception as e:
+         return f"分析失败:{str(e)}"
+
+
+ # ================== Gradio UI ==================
+ with gr.Blocks(title="教学智能体 · 实验报告分析") as demo:
+     gr.Markdown("# GIS实验报告智能分析系统")
+     gr.Markdown("**上传融合后的学生反馈 Excel → 一键生成教学决策图**")
+
+     with gr.Row():
+         file_input = gr.File(
+             label="上传 ex02.xlsx(含 s1-s4 列)",
+             file_types=[".xlsx"]
+         )
+
+     with gr.Row():
+         output = gr.HTML(label="分析结果")
+
+     # analyze_report returns a single HTML string, matching the single output
+     file_input.change(analyze_report, inputs=file_input, outputs=output)
+
+     gr.Markdown("---")
+     gr.Markdown("**后续将接入通义千问大模型,自动生成教案、微课脚本、作业设计**")
+
+ # ================== Launch ==================
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True  # set False to keep the app local
+     )
qwen_api.py ADDED
@@ -0,0 +1,30 @@
+ # qwen_api.py
+ import os
+ import requests
+
+ # Read the key from the environment rather than hard-coding a secret in the repo
+ QWEN_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
+ QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
+
+ def call_qwen(prompt: str) -> str:
+     headers = {
+         "Authorization": f"Bearer {QWEN_API_KEY}",
+         "Content-Type": "application/json"
+     }
+     payload = {
+         "model": "qwen-plus",
+         "input": {"prompt": prompt},
+         "parameters": {
+             "result_format": "text",
+             "temperature": 0.7,
+             "top_p": 0.8
+         }
+     }
+     try:
+         resp = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=30)
+         resp.raise_for_status()
+         result = resp.json()
+         return result['output']['text']
+     except Exception as e:
+         return f"【大模型调用失败】{str(e)}"
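Usage sketch: since the module reads the key at import time, set the environment variable first. DASHSCOPE_API_KEY is the name the rewritten module reads, matching the official DashScope convention.

    import os
    os.environ["DASHSCOPE_API_KEY"] = "sk-..."  # placeholder; never commit real keys

    from qwen_api import call_qwen
    print(call_qwen("用一句话介绍核密度分析。"))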
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==4.26.0
+ plotly
+ kaleido
+ pandas
+ openpyxl
+ scikit-learn
+ jieba
+ sentence-transformers
+ requests
+ torch
+ transformers
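One gap worth noting: matplotlib is imported directly by cluster_insight.py and sankey_plot.py yet is not listed explicitly here; it may arrive only transitively (e.g. via gradio). A quick runtime check that the static-image stack is complete:

    import plotly.graph_objects as go

    png = go.Figure().to_image(format="png")  # raises if kaleido is missing
    print(f"kaleido OK ({len(png)} bytes)")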
sankey_plot.py ADDED
@@ -0,0 +1,54 @@
+ # sankey_plot.py
+ import base64
+
+ import plotly.graph_objects as go
+ import matplotlib.cm as cm
+ import matplotlib.colors as mcolors
+ import pandas as pd
+
+
+ def plot_sankey_from_df(sankey_df: pd.DataFrame, title="问题 → 关键词共现") -> str:
+     if sankey_df.empty:
+         return "无数据"
+
+     # Keep only the top 15 target keywords by total flow
+     top_targets = sankey_df.groupby('target')['value'].sum().sort_values(ascending=False).head(15).index
+     df = sankey_df[sankey_df['target'].isin(top_targets)].copy()
+
+     # Node order: question nodes first, then keyword nodes
+     sources = ['S4_应用场景', 'S3_操作疑惑', 'S2_讲解需求', 'S1_难点']
+     sources = [s for s in sources if s in df['source'].unique()]
+     targets = top_targets.tolist()
+     all_nodes = sources + targets
+     node_index = {n: i for i, n in enumerate(all_nodes)}
+
+     # Colors: fixed palette for question nodes, Set3 colormap for keywords
+     source_color_map = {
+         'S1_难点': '#345DA7', 'S2_讲解需求': '#3B8AC4',
+         'S3_操作疑惑': '#4BB4DE', 'S4_应用场景': '#EFDBCB'
+     }
+     cmap = cm.get_cmap('Set3', len(targets))
+     target_colors = [mcolors.to_hex(cmap(i)) for i in range(len(targets))]
+     target_color_map = dict(zip(targets, target_colors))
+
+     node_colors = [source_color_map.get(n, target_color_map.get(n, 'gray')) for n in all_nodes]
+     link_colors = [target_color_map.get(t, 'gray') for t in df['target']]
+
+     fig = go.Figure(data=[go.Sankey(
+         node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5),
+                   label=all_nodes, color=node_colors),
+         link=dict(
+             source=df['source'].map(node_index),
+             target=df['target'].map(node_index),
+             value=df['value'],
+             color=link_colors
+         )
+     )])
+
+     fig.update_layout(title_text=title, font=dict(family="Microsoft YaHei", size=18), width=900, height=600,
+                       margin=dict(l=20, r=20, t=60, b=20))
+
+     # Export a hi-res PNG (requires kaleido); scale=2 doubles the DPI
+     img_bytes = fig.to_image(
+         format="png",
+         width=900,
+         height=600,
+         scale=2
+     )
+     return base64.b64encode(img_bytes).decode()
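An end-to-end render check with a toy link table (kaleido must be installed for fig.to_image):

    import pandas as pd
    from sankey_plot import plot_sankey_from_df

    links = pd.DataFrame([
        {"source": "S1_难点", "target": "核密度分析", "value": 3},
        {"source": "S2_讲解需求", "target": "搜索半径", "value": 2},
    ])
    print(plot_sankey_from_df(links, title="toy check")[:60])  # base64 prefix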