"""
生成停用词和桑基图数据
"""
import pandas as pd
import jieba
import re
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
# Domain vocabulary (originally domain_vocab.py)
DOMAIN_VOCAB = [
    # Chinese domain terms (deduplicated and merged)
"空间连接", "字段计算器", "建筑面积", "城市规划", "叠加分析", "空间连接功能",
"数据表", "建筑层数", "地理处理", "相交功能", "现状地块", "相交叠加",
"地块属性", "分地块", "容积率统计", "计算方法", "参数设置", "软件设置",
"核密度分析", "热点分析", "带宽", "密度场", "焦点", "焦点统计",
"地图代数", "条件分析", "差运算", "最大值", "交通", "像元大小",
"参数", "凸包", "餐饮", "住宿", "搜索半径", "栅格计算器", "重分类", "Con函数",
    # English domain terms (deduplicated)
    "ArcGIS", "spatial join", "ArcMap", "Map algebra", "Kernel Density",
    "Con", "Getis-Ord Gi*", "NDVI", "Raster Calculator", "dwg", "catalog",
    "data manager", "POI",
]
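# DOMAIN_VOCAB is used in two ways below: these terms are excluded when building the
# stopword list, and each one is registered with jieba (jieba.add_word) so it survives
# segmentation as a single token. New terms only need to be appended to this list.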
def generate_domain_stopwords(df, text_columns, domain_keywords):
"""结合领域关键词生成停用词表"""
# 1. 基础停用词(通用功能词)
common_stopwords = {"的", "了", "在", "是", "是否", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "那", "这个", "那个", "什么", "怎么",
"哪里", "时候", "然后", "可能", "应该", "可以", "就是", "还是", "但是", "不过", "如果", "因为", "所以", "而且", "或者", "其实", "觉得", "认为", "希望", "能够", "需要", "知道", "表示", "这样", "那样", "这些", "那些", "有点", "一点",
"一些","进一步", "具体", "问题", "疑惑", "讲解", "需求", "难点", "操作", "应用", "场景", "进行", "对于", "实际", "情况", "结合", "对于",
"学生", "老师", "实验", "报告", "作业", "课程", "课堂", "学习", "理解", "掌握", "明白", "清楚",
"建议", "希望", "请问", "想问", "不懂", "不会", "不知道", "不太会", "不太懂", "不太清楚",
}
    # 2. Domain keywords (if provided)
domain_words = set(domain_keywords)
# if domain_keywords_file and os.path.exists(domain_keywords_file):
# with open(domain_keywords_file, "r", encoding="utf-8") as f:
# domain_words = set([line.strip() for line in f if line.strip()])
    # 3. Merge the text columns and preprocess
all_text = ""
for col in text_columns:
all_text += " ".join(df[col].fillna("").astype(str))
all_text = re.sub(r"[^\w\s]", "", all_text)
all_text = re.sub(r"\d+", "", all_text)
    all_text = all_text.lower()  # normalize English to lowercase
words = jieba.lcut(all_text)
    # 4. Count word frequencies
word_freq = Counter(words)
word_freq = {word: freq for word, freq in word_freq.items() if len(word) > 1}
    # 5. Build the candidate stopword set:
    #    - generic function words
    #    - high-frequency tokens that are not domain keywords
    #      (frequency > 50% of the number of rows, and not in the domain vocabulary)
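    #    (Note: freq counts token occurrences across all text columns, not distinct
    #    responses, so the 50% threshold is a heuristic rather than a strict
    #    "appears in half of the rows" rule.)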
stopwords = common_stopwords.copy()
for word, freq in word_freq.items():
if freq > len(df) * 0.5 and word not in domain_words:
stopwords.add(word)
    # 6. Save the stopword list (left commented out)
# with open(output_path, "w", encoding="utf-8") as f:
# for word in stopwords:
# f.write(word + "\n")
# print(f"领域定制化停用词表已生成,共{len(stopwords)}个词,保存至{output_path}")
return stopwords
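# Example usage (a minimal sketch; the DataFrame and its column names are illustrative):
#   demo = pd.DataFrame({"s1": ["核密度分析的带宽参数应该怎么设置"],
#                        "s2": ["希望讲解栅格计算器的用法"]})
#   sw = generate_domain_stopwords(demo, ["s1", "s2"], DOMAIN_VOCAB)
#   assert "核密度分析" not in sw  # domain keywords are never added as stopwords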
def load_and_preprocess_data(df, stopwords, domain_words):
"""加载数据并进行预处理(优化专业词汇识别)"""
stopwords = set(stopwords)
# with open(stopwords_path, "r", encoding="utf-8") as f:
# for line in f:
# stopwords.add(line.strip())
    question_types = {
        "s1": "难点",       # difficulties
        "s2": "讲解需求",   # topics students want explained
        "s3": "操作疑惑",   # questions about software operation
        "s4": "应用场景"    # application scenarios
    }
    # Causal ordering of the questions (smaller index = earlier in the chain)
question_hierarchy = ["s1", "s2", "s3", "s4"]
    # Build a dictionary of professional terms (load all domain words up front)
professional_dict = {}
    # Register the domain terms with jieba so they are segmented as single tokens
for word in domain_words:
        jieba.add_word(word, freq=10000)  # high frequency so these terms take priority in segmentation
professional_dict[word] = 1
    # Build a regex pattern over the professional terms;
    # terms are sorted by length (descending) so longer terms match first
sorted_domain = sorted(domain_words, key=len, reverse=True)
pattern_str = "|".join(re.escape(word) for word in sorted_domain)
professional_pattern = re.compile(f"({pattern_str})")
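    # Because the alternation lists longer terms first, compound terms win: e.g.
    # "空间连接功能" is matched as one unit rather than as "空间连接" plus a leftover "功能".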
def clean_text(text):
if not isinstance(text, str):
return []
        # Step 1: basic cleaning
text_cleaned = re.sub(r"[^\w\s]", "", text)
text_cleaned = re.sub(r"\d+", "", text_cleaned)
text_cleaned = text_cleaned.lower()
        # Step 2: identify and mark professional terms by
        # locating every occurrence in the cleaned text
matches = []
for match in professional_pattern.finditer(text_cleaned):
start, end = match.span()
matches.append((start, end, text_cleaned[start:end]))
        # Step 3: split the text while protecting professional terms
segments = []
last_end = 0
        # Split the text at the match boundaries
for start, end, word in matches:
            # plain text before this match
if start > last_end:
segments.append(text_cleaned[last_end:start])
            # the professional term itself, kept as a single unit
segments.append(word)
last_end = end
        # trailing text after the last match
if last_end < len(text_cleaned):
segments.append(text_cleaned[last_end:])
        # Step 4: run jieba on the non-professional segments
final_words = []
for segment in segments:
if segment in professional_dict:
                # professional terms are appended as-is
final_words.append(segment)
else:
                # ordinary text is segmented and then filtered
words = jieba.lcut(segment)
words = [w for w in words if w not in stopwords and len(w) > 1]
final_words.extend(words)
return final_words
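    # Illustrative behaviour of clean_text (the exact output depends on jieba's
    # segmentation and on the generated stopword list):
    #   clean_text("空间连接功能的参数设置") -> ["空间连接功能", "参数设置"]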
for col in ["s1", "s2", "s3", "s4"]:
df[col + "_words"] = df[col].apply(clean_text)
return df, question_types, question_hierarchy
def build_sankey_data(df, question_columns, top_n=30):
"""
构建桑基图用的数据(DataFrame: source, target, value)
- 仅保留前 top_n 个全局高频关键词
"""
question_labels = {
"s1": "S1_难点",
"s2": "S2_讲解需求",
"s3": "S3_操作疑惑",
"s4": "S4_应用场景"
}
    # 1. Count global keyword frequencies
all_keywords = []
for col in question_columns:
all_keywords.extend([kw for kws in df[col + "_words"] for kw in kws])
keyword_freq = Counter(all_keywords)
core_keywords = set([kw for kw, _ in keyword_freq.most_common(top_n)])
    # 2. Count the question -> keyword links
link_counter = Counter()
for _, row in df.iterrows():
for q in question_columns:
q_label = question_labels[q]
keywords = row[q + "_words"]
for kw in keywords:
if kw in core_keywords:
link_counter[(q_label, kw)] += 1
    # 3. Convert to a DataFrame
sankey_data = pd.DataFrame([
{"source": src, "target": tgt, "value": val}
for (src, tgt), val in link_counter.items()
])
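    # Each row describes one question -> keyword link, e.g. (values are illustrative):
    #   source     target       value
    #   S1_难点    核密度分析     12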
# sankey_data.to_csv("E:\\data\\20250621Edu\\sankey_d\\sankey_data.csv", index=False, encoding='utf-8-sig')
# sankey_data.to_csv("E:\\data\\20250621Edu\\sankey_d\\sankey_data2.csv", index=False, encoding='utf-8-sig')
print("桑基图保存成功!")
return sankey_data
def generate_sankey_df(file_path, text_columns, domain_words, top_n=30):
df = pd.read_excel(file_path)
stopwords = generate_domain_stopwords(df, text_columns, domain_words)
df, question_types, question_hierarchy = load_and_preprocess_data(df, stopwords, domain_words)
sankey_data = build_sankey_data(df, text_columns, top_n)
return sankey_data
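# Rendering the result is left to the caller. A minimal sketch (assuming plotly is
# installed; not part of the pipeline above):
#
#   import plotly.graph_objects as go
#   nodes = pd.unique(sankey_data[["source", "target"]].values.ravel()).tolist()
#   idx = {name: i for i, name in enumerate(nodes)}
#   fig = go.Figure(go.Sankey(
#       node=dict(label=nodes),
#       link=dict(source=sankey_data["source"].map(idx).tolist(),
#                 target=sankey_data["target"].map(idx).tolist(),
#                 value=sankey_data["value"].tolist()),
#   ))
#   fig.show()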
if __name__ == '__main__':
"""主函数:执行完整的分析流程"""
file_path = "E:\\data\\20250621Edu\\ex02.xlsx"
stopwords_path = "E:\\data\\20250621Edu\\stop\\stop2.txt"
text_columns = ["s1", "s2", "s3", "s4"]
    top_n = 30  # keep the 30 most frequent keywords
sankey_data = generate_sankey_df(file_path, text_columns, DOMAIN_VOCAB, top_n)