"""Cluster short Chinese complaint texts into groups via TF-IDF + KMeans.

Usage: python script.py "['text1', 'text2', ...]" <n_clusters>
"""

import ast
import re
import sys
from collections import defaultdict

import jieba
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Custom stop-word list: common function words plus domain terms that
# appear in almost every complaint and carry no discriminating signal.
STOPWORDS = ['的', '是', '我', '你', '他', '不满', '举报', '市民', '投诉',
             '家长', '教育局', '学校', '学生']


def preprocess(text):
    """Strip punctuation, segment with jieba, and drop stop words.

    Returns a single space-joined token string, the pre-tokenized
    whitespace format that TfidfVectorizer expects.
    """
    # Remove punctuation: anything that is not a word char or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    words = jieba.cut(text)
    return ' '.join(word for word in words if word not in STOPWORDS)


def extract_features(texts):
    """Return a dense TF-IDF feature matrix for the given documents."""
    tfidf_vectorizer = TfidfVectorizer()
    features = tfidf_vectorizer.fit_transform(texts)
    return features.toarray()


def cluster(features, n_clusters):
    """Run KMeans on *features* and return the per-sample cluster labels."""
    # n_init pinned explicitly: its default changed across sklearn
    # versions (10 -> 'auto'), which otherwise triggers FutureWarnings.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    kmeans.fit(features)
    return kmeans.labels_


def getgroups(liststr, groupnum):
    """Cluster the texts in *liststr* (a Python list literal) into
    *groupnum* groups.

    Returns a list of strings, one per cluster, each of the form
    "|text1|text2|..." — the leading '|' is kept for backward
    compatibility with existing consumers of the script's output.
    """
    # Security fix: literal_eval accepts only literals, unlike eval(),
    # which would execute arbitrary code supplied on the command line.
    listarray = ast.literal_eval(liststr)
    processed_texts = [preprocess(str(text)) for text in listarray]
    features = extract_features(processed_texts)
    labels = cluster(features, groupnum)

    # Group the original texts by their cluster label, preserving the
    # original "|"-prefixed concatenation format.
    groups = defaultdict(str)
    for label, text in zip(labels, listarray):
        groups[label] += "|" + str(text)
    return list(groups.values())


if __name__ == '__main__':
    try:
        # argv[1]: Python list literal of texts; argv[2]: cluster count.
        result = getgroups(sys.argv[1], int(sys.argv[2]))
        print(result)
    except Exception as e:
        # Top-level boundary: report the error rather than a traceback.
        print(e)