12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
import ast
import re
import sys
from collections import defaultdict

import jieba
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess(text):
    """Normalize one document for TF-IDF vectorization.

    Strips punctuation, segments the (Chinese) text with jieba, and
    removes stopwords.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-joined tokens, suitable as input to TfidfVectorizer.
    """
    # Drop every character that is not a word character or whitespace
    # (\w matches CJK characters too, so only punctuation is removed).
    text = re.sub(r'[^\w\s]', '', text)
    # Chinese function words plus domain terms ("school", "student",
    # "complaint", ...) that appear in nearly every record and carry no
    # clustering signal.  A set gives O(1) membership tests; the
    # original list was scanned linearly for every token.
    stopwords = {'的', '是', '我', '你', '他', '不满', '举报', '市民',
                 '投诉', '家长', '教育局', '学校', '学生'}
    return ' '.join(word for word in jieba.cut(text)
                    if word not in stopwords)
def extract_features(texts):
    """Turn preprocessed documents into a dense TF-IDF matrix.

    Parameters
    ----------
    texts : iterable of str
        Whitespace-tokenized documents (output of ``preprocess``).

    Returns
    -------
    numpy.ndarray
        Dense array of shape (n_documents, n_terms).
    """
    # NOTE(review): the default token_pattern ignores single-character
    # tokens; pass token_pattern=r"(?u)\b\w+\b" if one-character
    # Chinese words must be counted as features.
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(texts).toarray()
def cluster(features, n_clusters):
    """Partition the feature matrix with k-means.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    ndarray of shape (n_samples,)
        Cluster label assigned to each sample.
    """
    # KMeans.fit returns the fitted estimator, so fit and label
    # extraction chain into one expression.
    return KMeans(n_clusters=n_clusters).fit(features).labels_
- # def visualize(labels):
- # unique_labels = set(labels)
- # #sizes = [labels.unique(label) for label in unique_labels]
- # counts = np.unique(labels, return_counts=True)
- # sizes = counts[1]
- # labels = [str(label) for label in unique_labels]
- # plt.pie(sizes, labels=labels, autopct='%1.1f%%')
- # plt.axis('equal')
- # plt.show()
def getgroups(liststr, groupnum):
    """Cluster a stringified list of texts into ``groupnum`` groups.

    Parameters
    ----------
    liststr : str
        Python-literal representation of a list of texts, e.g.
        "['complaint one', 'complaint two']".
    groupnum : int
        Number of clusters to form.

    Returns
    -------
    list of str
        One string per cluster; each member text is prefixed with '|'
        (the string therefore starts with '|', matching the original
        output format).
    """
    # SECURITY: the string comes from the command line; literal_eval
    # only parses Python literals, unlike eval() which would execute
    # arbitrary code.
    listarray = ast.literal_eval(liststr)
    # Feature extraction over the normalized documents.
    processed_texts = [preprocess(str(text)) for text in listarray]
    features = extract_features(processed_texts)
    # Cluster and regroup the ORIGINAL texts by assigned label.
    labels = cluster(features, groupnum)
    groups = defaultdict(str)
    for label, text in zip(labels, listarray):
        groups[label] += "|" + str(text)
    # dict preserves insertion order, so this matches the original
    # items() iteration order.
    return list(groups.values())
if __name__ == '__main__':
    # Usage: script.py "<python-list-literal-of-texts>" <n_clusters>
    try:
        result = getgroups(sys.argv[1], int(sys.argv[2]))
        print(result)
    except Exception as e:
        # Report failures on stderr and exit nonzero so a caller that
        # consumes stdout can distinguish results from errors (the
        # original printed the error to stdout and exited 0).
        print(e, file=sys.stderr)
        sys.exit(1)
|