import os
import jieba
import collections
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
path_hockey = r'E:\研一上\数据挖掘\dataset\Case1-classification\hockey\52550'
path_stop_words = r'E:\研一上\数据挖掘\dataset\stopwords.txt'
root_path = r'E:\研一上\数据挖掘\dataset\Case1-classification'
def get_stopwords(stop_path):
    """Load the stop-word list, one word per line."""
    with open(stop_path) as f:
        stop_words_list = [word.strip() for word in f.readlines()]
    return stop_words_list
def get_total_words(root_path, path_stop_words):
    """Build a word -> index encoding and a global word counter over the whole corpus.

    This function has extremely high time complexity.
    """
    stop_words = get_stopwords(path_stop_words)
    total_list = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            # the documents of interest are the files whose names are pure digits
            if file.isdigit():
                lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                for line in lines:
                    split_line = [word.strip().lower() for word in jieba.cut(line.strip())
                                  if word not in stop_words and len(word) != 0]
                    total_list.extend(split_line)
    total_words_count = collections.Counter(total_list)
    total_rid_dup = list(set(total_list))
    # assign every distinct word an integer index
    word_encode = dict(zip(total_rid_dup, range(len(total_rid_dup))))
    with open(os.path.join(root_path, 'word_encode.txt'), 'w', encoding='utf-8') as f:
        f.write(str(word_encode))
    with open(os.path.join(root_path, 'total_words_count.txt'), 'w', encoding='utf-8') as f:
        f.write(str(total_words_count))
    return word_encode, total_words_count
def get_document_vec(doc_path, stop_path, root_path):
    """Encode one document as [class flag, {word index: count}]."""
    # rebuilding the full vocabulary for every single document is the main bottleneck
    word_encode, total_words_count = get_total_words(root_path, stop_path)
    stop_words = get_stopwords(stop_path)
    words_list = []
    lines = open(doc_path, 'rb').readlines()
    for line in lines:
        sentence = jieba.cut(line.strip())
        clear_sen = [word.lower() for word in sentence if word not in stop_words]
        words_list.extend(clear_sen)
    words_set = set(words_list)
    words_count = collections.Counter(words_list)
    # class flag: '+1' for documents under the baseball folder, '-1' otherwise
    if 'baseball' in doc_path.split('\\'):
        flag = '+1'
    else:
        flag = '-1'
    words_freq = []
    words_encode_list = []
    words_count_list = []
    words_freq.append(flag)
    for word in words_set:
        if len(word) == 0 or not word.isalpha():
            continue
        word_idx = word_encode.get(word, 'non')
        word_count = words_count.get(word, 0)
        words_encode_list.append(word_idx)
        words_count_list.append(word_count)
    words_freq.append(dict(zip(words_encode_list, words_count_list)))
    return words_freq
def mk_document_matrix(root_path, stop_path):
    """Write every document's sparse vector to matrix.txt, one line per document."""
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            if file.isdigit():
                file_path = os.path.join(sub_folder_path, file)
                doc_vec = get_document_vec(file_path, stop_path, root_path)
                with open(os.path.join(root_path, 'matrix.txt'), 'a', encoding='utf-8') as f:
                    f.write(str(doc_vec) + '\n')
def prepare(root_path, stop_path):
    """Build a TF-IDF matrix with scikit-learn and save it, plus a 1/0 label column, to data.csv."""
    counter = CountVectorizer()
    trans = TfidfTransformer()
    stop_words = get_stopwords(stop_path)
    doc1 = []
    total_doc1 = []
    doc2 = []
    total_doc2 = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        if sub_folder == 'baseball':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc1c = doc1.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc1c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    # one space-joined string per document, which is what CountVectorizer expects
                    total_doc1.append(' '.join(doc1c))
        if sub_folder == 'hockey':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc2c = doc2.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc2c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    total_doc2.append(' '.join(doc2c))
    num1 = len(total_doc1)
    num2 = len(total_doc2)
    # label column: 1 for baseball documents, 0 for hockey documents
    label = np.concatenate((np.ones((num1, 1)), np.zeros((num2, 1))), axis=0)
    print(label)
    total_doc1.extend(total_doc2)
    x = counter.fit_transform(total_doc1)
    y = trans.fit_transform(x).toarray()
    data = np.concatenate((y, label), axis=1)
    print(data[:, -1])
    # vocabulary terms become the column names (use get_feature_names_out() on scikit-learn >= 1.0)
    features = counter.get_feature_names()
    features.append('label')
    df2 = pd.DataFrame(data, columns=features)
    print(df2.head(2))
    df2.to_csv('./data.csv')
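# --- Illustrative follow-up (a sketch, not part of the pipeline above) -------
# A minimal example of how the data.csv written by prepare() could be consumed,
# assuming its last column 'label' holds the 1/0 class and that a plain
# scikit-learn LogisticRegression is an acceptable baseline classifier.
def _demo_train_from_csv(csv_path='./data.csv'):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(csv_path, index_col=0)
    X = df.drop(columns=['label']).values   # TF-IDF features
    y = df['label'].values                  # 1 = baseball, 0 = hockey
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print('held-out accuracy:', clf.score(X_test, y_test))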
def main():
    mk_document_matrix(root_path, path_stop_words)
    prepare(root_path, path_stop_words)


if __name__ == '__main__':
    main()