Text Preprocessing


# -*- coding: utf-8 -*-
# @Time     : 2019/10/31 14:15
# @Author   : Chicker
# @FileName : word2vec.py
# @Software : PyCharm
# @Blog     : http://blog.csdn.net/u010105243/article/
import os
import collections

import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

path_hockey = r'E:\研一上\数据挖掘\dataset\Case1-classification\hockey\52550'
path_stop_words = r'E:\研一上\数据挖掘\dataset\stopwords.txt'
root_path = r'E:\研一上\数据挖掘\dataset\Case1-classification'


def get_stopwords(stop_path):
    """Load the stop-word list, one word per line."""
    with open(stop_path) as f:
        stop_words_list = [word.strip() for word in f.readlines()]
    return stop_words_list


def get_total_words(root_path, path_stop_words):
    """Build the corpus vocabulary. Note: this function has very high time complexity."""
    stop_words = get_stopwords(path_stop_words)
    total_list = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            if file.isdigit():
                # The newsgroup files are not guaranteed to be UTF-8, so read raw
                # bytes and let jieba handle the decoding.
                lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                for line in lines:
                    split_line = [word.strip().lower() for word in jieba.cut(line.strip())
                                  if word not in stop_words and len(word) != 0]
                    total_list.extend(split_line)
    total_words_count = collections.Counter(total_list)
    total_rid_dup = list(set(total_list))
    # Assign each distinct word an integer id.
    word_encode = dict(zip(total_rid_dup, range(len(total_rid_dup))))
    with open(os.path.join(root_path, 'word_encode.txt'), 'w', encoding='utf-8') as f:
        f.write(str(word_encode))
    with open(os.path.join(root_path, 'total_words_count.txt'), 'w', encoding='utf-8') as f:
        f.write(str(total_words_count))
    return word_encode, total_words_count


def get_document_vec(doc_path, stop_path, root_path):
    """Encode one document as [label, {word_id: count, ...}]."""
    word_encode, total_words_count = get_total_words(root_path, stop_path)
    stop_words = get_stopwords(stop_path)
    words_list = []
    lines = open(doc_path, 'rb').readlines()
    for line in lines:
        sentence = jieba.cut(line.strip())
        clear_sen = [word.lower() for word in sentence if word not in stop_words]
        words_list.extend(clear_sen)
    words_set = set(words_list)
    words_count = collections.Counter(words_list)
    # Documents under the 'baseball' folder are the positive class.
    if 'baseball' in doc_path.split('\\'):
        flag = '+1'
    else:
        flag = '-1'
    words_freq = []
    words_encode_list = []
    words_count_list = []
    words_freq.append(flag)
    for word in words_set:
        if len(word) == 0 or not word.isalpha():
            pass
        else:
            word_idx = word_encode.get(word, 'non')
            word_count = words_count.get(word, 0)
            words_encode_list.append(word_idx)
            words_count_list.append(word_count)
    words_freq.append(dict(zip(words_encode_list, words_count_list)))
    return words_freq


def mk_document_matrix(root_path, stop_path):
    """Append one encoded document per line to matrix.txt."""
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            if file.isdigit():
                file_path = os.path.join(sub_folder_path, file)
                doc_vec = get_document_vec(file_path, stop_path, root_path)
                with open(os.path.join(root_path, 'matrix.txt'), 'a', encoding='utf-8') as f:
                    f.write(str(doc_vec) + '\n')


# mk_document_matrix(root_path, path_stop_words)


def prepare(root_path, stop_path):
    """Build a TF-IDF matrix for the baseball/hockey documents and save it as data.csv."""
    counter = CountVectorizer()
    trans = TfidfTransformer()
    stop_words = get_stopwords(stop_path)
    doc1 = []
    total_doc1 = []
    doc2 = []
    total_doc2 = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        if sub_folder == 'baseball':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc1c = doc1.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc1c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    total_doc1.append(' '.join(doc1c))
        if sub_folder == 'hockey':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc2c = doc2.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc2c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    total_doc2.append(' '.join(doc2c))
    num1 = len(total_doc1)
    num2 = len(total_doc2)
    # Label baseball documents 1 and hockey documents 0.
    label = np.concatenate((np.ones((num1, 1)), np.zeros((num2, 1))), axis=0)
    print(label)
    total_doc1.extend(total_doc2)
    x = counter.fit_transform(total_doc1)
    y = trans.fit_transform(x).toarray()
    data = np.concatenate((y, label), axis=1)
    print(data[:, -1])
    # On scikit-learn >= 1.0 use list(counter.get_feature_names_out()) instead.
    features = counter.get_feature_names()
    features.append('label')
    df2 = pd.DataFrame(data, columns=features)
    print(df2.head(2))
    df2.to_csv('./data.csv')


def main():
    mk_document_matrix(root_path, path_stop_words)
    prepare(root_path, path_stop_words)


if __name__ == '__main__':
    main()
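The script above ends by writing the TF-IDF matrix to data.csv, with the class label in the last column. As a minimal sketch of how that file could be consumed downstream (the choice of LogisticRegression and the 80/20 split are assumptions, not part of the original post):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the matrix written by prepare(); the first CSV column is the pandas index.
df = pd.read_csv('./data.csv', index_col=0)
X = df.drop(columns=['label']).values
y = df['label'].values

# Hold out 20% of the documents for evaluation (assumed split, not from the original).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))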