Comparison of Text Similarity Algorithms and Their Python Implementations
Preface
A common requirement is to check the duplication rate between two articles or two product descriptions.
To solve this kind of problem, this post lists several common similarity algorithms and implements them in Python.
Five common similarity algorithms: cosine similarity (cosine_similarity), Jaccard similarity, edit distance (Levenshtein), MinHash, and SimHash + Hamming distance.
The code was left behind by a senior colleague; I have tidied it up and am sharing it here. I won't rehash the theory behind each algorithm; you can look it up yourself. Feel free to leave a comment if you have any questions. Thanks!
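The snippets below rely on a few third-party packages. As a rough environment sketch (package names are inferred from the imports, and the Levenshtein binding in particular ships under more than one name on PyPI, so adjust for your setup):
# Assumed dependencies, inferred from the imports used in the listings below:
#   pip install jieba scikit-learn python-Levenshtein datasketch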
Cosine similarity (cosine_similarity)
import re
import html
import jieba
import jieba.analyse
from sklearn.metrics.pairwise import cosine_similarity


class CosineSimilarity(object):
    """
    Cosine similarity
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        # strip <style> blocks and HTML tags, then unescape HTML entities
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', content)
        content = html.unescape(content)
        # full-mode segmentation, then keep the top 200 keywords
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False)
        return keywords

    @staticmethod
    def one_hot(word_dict, keywords):
        # encode the keyword list as a count vector over the shared vocabulary
        cut_code = [0] * len(word_dict)
        for word in keywords:
            cut_code[word_dict[word]] += 1
        return cut_code

    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('./files/stopwords.txt')

        # extract keywords from both texts and build the shared vocabulary
        keywords1 = self.extract_keyword(self.s1)
        keywords2 = self.extract_keyword(self.s2)
        union = set(keywords1).union(set(keywords2))
        word_dict = {}
        i = 0
        for word in union:
            word_dict[word] = i
            i += 1

        # vectorize both keyword lists and compute the cosine similarity
        s1_cut_code = self.one_hot(word_dict, keywords1)
        s2_cut_code = self.one_hot(word_dict, keywords2)
        sample = [s1_cut_code, s2_cut_code]
        try:
            sim = cosine_similarity(sample)
            return sim[1][0]
        except Exception as e:
            print(e)
            return 0.0


if __name__ == '__main__':
    with open('./files/sample_x.txt', 'r') as x, open('./files/sample_y.txt', 'r') as y:
        content_x = x.read()
        content_y = y.read()
        similarity = CosineSimilarity(content_x, content_y)
        similarity = similarity.main()
        print('Similarity: %.2f%%' % (similarity * 100))
Output:
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.915 seconds.
Prefix dict has been built succesfully.
Similarity: 79.67%
Jaccard similarity
import re
import jieba
import jieba.analyse
import html


class JaccardSimilarity(object):
    """
    Jaccard similarity
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        # strip <style> blocks and HTML tags, then unescape HTML entities
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', content)
        content = html.unescape(content)
        # full-mode segmentation, then keep the top 200 keywords
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False)
        return keywords

    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('./files/stopwords.txt')

        # Jaccard similarity = |intersection| / |union| of the two keyword sets
        keywords_x = self.extract_keyword(self.s1)
        keywords_y = self.extract_keyword(self.s2)
        intersection = len(list(set(keywords_x).intersection(set(keywords_y))))
        union = len(list(set(keywords_x).union(set(keywords_y))))
        sim = float(intersection) / union if union != 0 else 0
        return sim


if __name__ == '__main__':
    with open('./files/sample_x.txt', 'r') as x, open('./files/sample_y.txt', 'r') as y:
        content_x = x.read()
        content_y = y.read()
        similarity = JaccardSimilarity(content_x, content_y)
        similarity = similarity.main()
        print('Similarity: %.2f%%' % (similarity * 100))
Output:
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.893 seconds.
Prefix dict has been built succesfully.
Similarity: 66.20%
Edit distance (Levenshtein)
import re
import html
import jieba
import jieba.analyse
import Levenshtein


class LevenshteinSimilarity(object):
    """
    Edit distance (Levenshtein)
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        # strip <style> blocks and HTML tags, then unescape HTML entities
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', content)
        content = html.unescape(content)
        # full-mode segmentation, then keep the top 200 keywords
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False)
        return keywords

    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('./files/stopwords.txt')

        # join each keyword list into a string and compare with the Levenshtein ratio
        keywords1 = ', '.join(self.extract_keyword(self.s1))
        keywords2 = ', '.join(self.extract_keyword(self.s2))
        distances = Levenshtein.ratio(keywords1, keywords2)
        return distances


if __name__ == '__main__':
    with open('./files/sample_x.txt', 'r') as x, open('./files/sample_y.txt', 'r') as y:
        content_x = x.read()
        content_y = y.read()
        distance = LevenshteinSimilarity(content_x, content_y)
        distance = distance.main()
        print('Similarity: %.2f%%' % (distance * 100))
Output:
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.786 seconds.
Prefix dict has been built succesfully.
Similarity: 82.24%
MinHash
import re
import jieba
import jieba.analyse
import html
from datasketch import MinHash


class MinHashSimilarity(object):
    """
    MinHash
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        # strip <style> blocks and HTML tags, then unescape HTML entities
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', content)
        content = html.unescape(content)
        # full-mode segmentation, then keep the top 200 keywords
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False)
        return keywords

    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('./files/stopwords.txt')

        # hash each keyword into the two MinHash signatures,
        # then estimate the Jaccard similarity from the signatures
        m1, m2 = MinHash(), MinHash()
        s1 = self.extract_keyword(self.s1)
        s2 = self.extract_keyword(self.s2)
        for data in s1:
            m1.update(data.encode('utf8'))
        for data in s2:
            m2.update(data.encode('utf8'))
        return m1.jaccard(m2)


if __name__ == '__main__':
    with open('./files/sample_x.txt', 'r') as x, open('./files/sample_y.txt', 'r') as y:
        content_x = x.read()
        content_y = y.read()
        similarity = MinHashSimilarity(content_x, content_y)
        similarity = similarity.main()
        print('Similarity: %.2f%%' % (similarity * 100))
Output:
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.901 seconds.
Prefix dict has been built succesfully.
Similarity: 64.84%
SimHash + Hamming distance
import re
import html
import math
import jieba
import jieba.analyse


class SimHashSimilarity(object):
    """
    SimHash
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def get_bin_str(source):
        # hash a string into a 64-character binary string
        if source == "":
            return 0
        else:
            t = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            for c in source:
                t = ((t * m) ^ ord(c)) & mask
            t ^= len(source)
            if t == -1:
                t = -2
            t = bin(t).replace('0b', '').zfill(64)[-64:]
            return str(t)

    @staticmethod
    def extract_keyword(content):
        # strip <style> blocks and HTML tags, then unescape HTML entities
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', content)
        content = html.unescape(content)
        # full-mode segmentation, then keep the top 200 keywords with weights
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=True)
        return keywords

    def run(self, keywords):
        # build the SimHash fingerprint: for each keyword, map its hash bits to
        # +weight / -weight, sum column by column, then take the sign of each column
        ret = []
        for keyword, weight in keywords:
            bin_str = self.get_bin_str(keyword)
            key_list = []
            for c in bin_str:
                weight = math.ceil(weight)
                if c == "1":
                    key_list.append(int(weight))
                else:
                    key_list.append(-int(weight))
            ret.append(key_list)
        rows = len(ret)
        cols = len(ret[0])
        result = []
        for i in range(cols):
            tmp = 0
            for j in range(rows):
                tmp += int(ret[j][i])
            if tmp > 0:
                tmp = "1"
            elif tmp <= 0:
                tmp = "0"
            result.append(tmp)
        return "".join(result)

    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('./files/stopwords.txt')

        # fingerprint both texts and count the differing bits (Hamming distance)
        s1 = self.extract_keyword(self.s1)
        s2 = self.extract_keyword(self.s2)
        sim_hash1 = self.run(s1)
        sim_hash2 = self.run(s2)
        length = 0
        for index, char in enumerate(sim_hash1):
            if char == sim_hash2[index]:
                continue
            else:
                length += 1
        return length


if __name__ == '__main__':
    with open('./files/sample_x.txt', 'r') as x, open('./files/sample_y.txt', 'r') as y:
        content_x = x.read()
        content_y = y.read()
        similarity = SimHashSimilarity(content_x, content_y)
        similarity = similarity.main()
        threshold = 3
        print(f'Hamming distance: {similarity}  Threshold: {threshold}  Similar: {similarity <= threshold}')
Output:
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.903 seconds.
Prefix dict has been built succesfully.
Hamming distance: 17  Threshold: 3  Similar: False
Discussion
A comparison of the algorithms:
Cosine similarity
Computational complexity is relatively high.
In related research, cosine similarity is the most widely used similarity measure in item-based collaborative filtering systems. In many real applications, however, the data is so sparse that cosine similarity can produce misleading results.
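As a minimal sketch of the formula behind cosine_similarity (not part of the original code, using made-up toy vectors): the score is just the dot product of two vectors divided by the product of their norms.
import numpy as np

# toy one-hot style count vectors (hypothetical values, for illustration only)
a = np.array([1, 0, 2, 1])
b = np.array([1, 1, 1, 0])

# cos(theta) = (a . b) / (|a| * |b|)
cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos_sim)  # dot = 3, norms ≈ 2.449 and 1.732, so ≈ 0.707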
Jaccard similarity
In product descriptions, operations staff often save time by copy-pasting existing text and making minor edits, which leads to heavily duplicated descriptions. Extracting keywords from each description and then computing the intersection and union of the two keyword sets, i.e. the Jaccard similarity, is a good fit for detecting this kind of duplication.
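A toy sketch of the set arithmetic (hypothetical keyword sets, just to illustrate the |intersection| / |union| idea):
# Jaccard similarity of two keyword sets: |intersection| / |union|
kw_x = {'price', 'battery', 'screen', 'camera'}   # hypothetical keywords
kw_y = {'price', 'battery', 'screen', 'weight'}

sim = len(kw_x & kw_y) / len(kw_x | kw_y)
print(sim)  # 3 shared keywords out of 5 distinct ones -> 0.6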
Edit distance
Computational complexity is relatively high, and it does not perform particularly well in this scenario.
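For a feel of what the Levenshtein functions return (a small sketch, assuming the same python-Levenshtein package used in the listing above):
import Levenshtein

# distance() counts the minimum number of single-character edits
print(Levenshtein.distance('kitten', 'sitting'))  # 3

# ratio() normalizes the edit cost into a similarity in [0, 1],
# which is what the class above returns for the joined keyword strings
print(Levenshtein.ratio('kitten', 'sitting'))     # roughly 0.62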
MinHash
A solution for computing Jaccard similarity on large datasets: by reducing the dimensionality of the text, it greatly speeds up the computation.
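A small sketch with datasketch comparing the MinHash estimate against the exact Jaccard value (toy token sets; num_perm controls the signature size and therefore the accuracy of the estimate):
from datasketch import MinHash

set1 = {'apple', 'banana', 'cherry', 'grape'}   # hypothetical token sets
set2 = {'apple', 'banana', 'cherry', 'melon'}

m1, m2 = MinHash(num_perm=128), MinHash(num_perm=128)
for token in set1:
    m1.update(token.encode('utf8'))
for token in set2:
    m2.update(token.encode('utf8'))

# estimated vs. exact Jaccard similarity (exact is 3/5 = 0.6 here)
print(m1.jaccard(m2))
print(len(set1 & set2) / len(set1 | set2))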
SimHash + Hamming distance
The error is fairly large for articles with fewer than 500 words.
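Once two fingerprints exist, the Hamming distance is simply the number of differing bits; a minimal sketch with made-up (shortened) fingerprints:
# Hamming distance between two SimHash fingerprints:
# XOR the integers and count the differing bits.
fp1 = 0b1011010111001010  # hypothetical (shortened) fingerprints
fp2 = 0b1011000111101010

hamming = bin(fp1 ^ fp2).count('1')
print(hamming)  # 2 bits differ, so the Hamming distance is 2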
The difference between MinHash and SimHash: https://blog.csdn.net/dm_ustc/article/details/45626907
Choose the algorithm that fits your scenario, or combine several. The full code is on GitHub: https://github.com/downdawn/Similarity
These are just my notes; feel free to leave a comment if you have any questions. Thanks!