import os
import collections

import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

path_hockey = r'E:\研一上\数据挖掘\dataset\Case1-classification\hockey\52550'
path_stop_words = r'E:\研一上\数据挖掘\dataset\stopwords.txt'
root_path = r'E:\研一上\数据挖掘\dataset\Case1-classification'

def get_stopwords(stop_path):
    """Load the stopword list, assumed to be one word per line."""
    # Read with an explicit encoding (utf-8 assumed) and close the handle.
    with open(stop_path, encoding='utf-8') as f:
        stop_words_list = [word.strip() for word in f.readlines()]
    return stop_words_list
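
def _demo_stopwords():
    # Hedged usage sketch (not in the original script). Assumes stopwords.txt
    # holds one stopword per line; prints the list size and a small sample.
    stop_words = get_stopwords(path_stop_words)
    print(len(stop_words), stop_words[:5])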

def get_total_words(root_path, path_stop_words):
    """Build the corpus vocabulary and global word counts.

    Note: this function has very high time complexity.
    """
    stop_words = get_stopwords(path_stop_words)
    total_list = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            if file.isdigit():  # documents are named by numeric id
                lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                for line in lines:
                    split_line = [word.strip().lower() for word in jieba.cut(line.strip())
                                  if word not in stop_words and len(word) != 0]
                    total_list.extend(split_line)
    total_words_count = collections.Counter(total_list)
    total_rid_dup = list(set(total_list))
    # Map every distinct word to an integer id.
    word_encode = dict(zip(total_rid_dup, range(len(total_rid_dup))))
    # Pass bare file names to os.path.join: a leading '\\' in the second
    # argument makes join discard root_path on Windows.
    with open(os.path.join(root_path, 'word_encode.txt'), 'w', encoding='utf-8') as f:
        f.write(str(word_encode))
    with open(os.path.join(root_path, 'total_words_count.txt'), 'w', encoding='utf-8') as f:
        f.write(str(total_words_count))  # originally wrote word_encode here by mistake
    return word_encode, total_words_count
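
def _demo_vocabulary():
    # Hedged sketch of what get_total_words() returns, using a toy token list
    # instead of walking the real dataset (toy_tokens is made up here).
    toy_tokens = ['hockey', 'game', 'hockey', 'ice']
    toy_count = collections.Counter(toy_tokens)               # corpus frequencies
    toy_vocab = list(set(toy_tokens))
    toy_encode = dict(zip(toy_vocab, range(len(toy_vocab))))  # word -> integer id
    print(toy_encode, toy_count)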

def get_document_vec(doc_path, stop_path, root_path):
    """Encode one document as [flag, {word_id: count}]."""
    # NB: rebuilding the whole vocabulary for every document is what makes
    # the pipeline slow; ideally compute it once and pass it in.
    word_encode, total_words_count = get_total_words(root_path, stop_path)
    stop_words = get_stopwords(stop_path)
    words_list = []
    lines = open(doc_path, 'rb').readlines()
    for line in lines:
        sentence = jieba.cut(line.strip())
        clear_sen = [word.lower() for word in sentence if word not in stop_words]
        words_list.extend(clear_sen)
    words_set = set(words_list)
    words_count = collections.Counter(words_list)
    # +1 for baseball documents, -1 for everything else (i.e. hockey).
    if 'baseball' in doc_path.split('\\'):
        flag = '+1'
    else:
        flag = '-1'
    words_freq = [flag]
    words_encode_list = []
    words_count_list = []
    for word in words_set:
        if len(word) == 0 or not word.isalpha():
            continue
        word_idx = word_encode.get(word, 'non')
        word_count = words_count.get(word, 0)
        words_encode_list.append(word_idx)
        words_count_list.append(word_count)
    words_freq.append(dict(zip(words_encode_list, words_count_list)))
    return words_freq
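
def _demo_document_vec():
    # Hedged sketch of get_document_vec() output: element 0 is the class flag
    # ('+1' baseball, '-1' otherwise), element 1 maps word ids to counts.
    # The ids and counts below are hypothetical, for illustration only.
    vec = ['+1', {17: 3, 102: 1}]
    flag, counts = vec
    print(flag, sum(counts.values()))  # label and total token count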

def mk_document_matrix(root_path, stop_path):
    """Append every document's sparse vector to matrix.txt, one per line."""
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(root_path, sub_folder)
        _, _, files = os.walk(sub_folder_path).__next__()
        for file in files:
            if file.isdigit():
                file_path = os.path.join(sub_folder_path, file)
                doc_vec = get_document_vec(file_path, stop_path, root_path)
                # Bare file name again: '\\matrix.txt' would discard root_path.
                with open(os.path.join(root_path, 'matrix.txt'), 'a', encoding='utf-8') as f:
                    f.write(str(doc_vec) + '\n')
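
def _demo_read_matrix(root_path):
    # Hedged sketch (not in the original): matrix.txt holds one str(list) per
    # line, so ast.literal_eval can recover each [flag, {word_id: count}] row.
    import ast
    with open(os.path.join(root_path, 'matrix.txt'), encoding='utf-8') as f:
        for line in f:
            flag, counts = ast.literal_eval(line)
            print(flag, len(counts))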

def prepare(root_path, stop_path):
    """Build a TF-IDF matrix over both classes and save it, with labels, as CSV."""
    counter = CountVectorizer()
    trans = TfidfTransformer()
    stop_words = get_stopwords(stop_path)
    doc1 = []
    total_doc1 = []
    doc2 = []
    total_doc2 = []
    folder, sub_folders, files = os.walk(root_path).__next__()
    for sub_folder in sub_folders:
        if sub_folder == 'baseball':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc1c = doc1.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc1c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    total_doc1.append(' '.join(doc1c))
        if sub_folder == 'hockey':
            sub_folder_path = os.path.join(root_path, sub_folder)
            _, _, files = os.walk(sub_folder_path).__next__()
            for file in files:
                if file.isdigit():
                    doc2c = doc2.copy()
                    lines = open(os.path.join(sub_folder_path, file), 'rb').readlines()
                    for line in lines:
                        doc2c.extend([word.strip().lower() for word in jieba.cut(line.strip())
                                      if word not in stop_words and word.isalpha() and len(word) < 15])
                    total_doc2.append(' '.join(doc2c))
    num1 = len(total_doc1)
    num2 = len(total_doc2)
    # Label baseball documents 1 and hockey documents 0.
    label = np.concatenate((np.ones((num1, 1)), np.zeros((num2, 1))), axis=0)
    print(label)
    total_doc1.extend(total_doc2)
    x = counter.fit_transform(total_doc1)
    y = trans.fit_transform(x).toarray()
    data = np.concatenate((y, label), axis=1)
    print(data[:, -1])
    # get_feature_names() was removed in scikit-learn 1.2; use the _out variant.
    features = list(counter.get_feature_names_out())
    features.append('label')
    df2 = pd.DataFrame(data, columns=features)
    print(df2.head(2))
    df2.to_csv('./data.csv')
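
def _demo_tfidf_vectorizer(docs):
    # Hedged alternative sketch: TfidfVectorizer bundles the CountVectorizer +
    # TfidfTransformer steps used in prepare() into a single estimator.
    # `docs` is assumed to be a list of whitespace-joined token strings.
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(docs)
    return tfidf, vectorizer.get_feature_names_out()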
    
def main():
    mk_document_matrix(root_path, path_stop_words)
    prepare(root_path, path_stop_words)


if __name__ == '__main__':
    main()