TextMatch
TextMatch is a semantic matching model library for QA and text search. It makes it easy to train models and to export representation vectors.
TextMatch/tests 模块包含:
(1) core_test
import sys
from textmatch.core.qa_match import QMatch, AMatch, SemanticMatch
# Sample corpus shared by the matching tests below: keys are document ids,
# values are the Chinese sentences that queries are matched against.
test_dict = {"id0": "其实事物发展有自己的潮流和规律",
"id1": "当你身处潮流之中的时候,要紧紧抓住潮流的机会",
"id2": "想办法脱颖而出,即使没有成功,也会更加洞悉时代的脉搏",
"id3": "收获珍贵的知识和经验。而如果潮流已经退去",
"id4": "这个时候再去往这个方向上努力,只会收获迷茫与压抑",
"id5": "对时代、对自己都没有什么帮助",
"id6": "但是时代的浪潮犹如海滩上的浪花,总是一浪接着一浪,只要你站在海边,身处这个行业之中,下一个浪潮很快又会到来。你需要敏感而又深刻地去观察,略去那些浮躁的泡沫,抓住真正潮流的机会,奋力一搏,不管成败,都不会遗憾。"}
def test_q_match(testword):
    """Match *testword* against the whole question corpus and return the scores."""
    model_names = ['bow', 'tfidf', 'ngram_tfidf']
    matcher = QMatch(q_dict=test_dict, match_models=model_names)
    scores = matcher.predict(
        testword,
        match_strategy='score',
        vote_threshold=0.5,
        key_weight={name: 1 for name in model_names},
    )
    print('q_match_pre>>>>>', scores)
    return scores
def test_a_match(testword):
    """Score *testword* against the candidate answers 'id0' and 'id1'."""
    model_names = ['bow', 'tfidf', 'ngram_tfidf']
    matcher = AMatch(a_dict=test_dict, match_models=model_names)
    scores = matcher.predict(
        testword,
        ['id0', 'id1'],
        match_strategy='score',
        vote_threshold=0.5,
        key_weight={name: 1 for name in model_names},
    )
    # Example result: {'id0': 1.0, 'id1': 0.0}
    print('a_match_pre>>>>>', scores)
    return scores
def test_semantic_match(testword, words_dict=test_dict):
    """Semantically match *testword* against entries 'id0', 'id1' and 'id5' of *words_dict*."""
    model_names = ['bow', 'tfidf', 'ngram_tfidf']
    matcher = SemanticMatch(words_dict=words_dict, match_models=model_names)
    scores = matcher.predict(
        testword,
        ['id0', 'id1', 'id5'],
        match_strategy='score',
        vote_threshold=0.5,
        key_weight={name: 1 for name in model_names},
    )
    # Example result: {'id0': 1.0, 'id1': 0.0}
    print('s_match_pre>>>>>', scores)
    return scores
if __name__ == '__main__':
    # Run all three matchers on the same query sentence.
    testword = "其实事物发展有自己的潮流和规律"
    test_q_match(testword)
    test_a_match(testword)
    test_semantic_match(testword)
    # 运行结果 (run results) are shown below.
q_match_pre>>>>> {'id0': 0.9999993948153113}
a_match_pre>>>>> {'id0': 0.9999993948153113, 'id1': 0.22259270511979246}
s_match_pre>>>>> {'id0': 0.9999993948153113, 'id1': 0.22259270511979246, 'id5': 0.09423726836364266}
import sys
import json
from textmatch.config.constant import Constant as const
from textmatch.core.text_embedding import TextEmbedding
# Sample corpus for the text-embedding demo: keys are document ids,
# values are the Chinese sentences to embed / compare against.
test_dict = {"id0": "其实事物发展有自己的潮流和规律",
"id1": "当你身处潮流之中的时候,要紧紧抓住潮流的机会",
"id2": "想办法脱颖而出,即使没有成功,也会更加洞悉时代的脉搏",
"id3": "收获珍贵的知识和经验。而如果潮流已经退去",
"id4": "这个时候再去往这个方向上努力,只会收获迷茫与压抑",
"id5": "对时代、对自己都没有什么帮助",
"id6": "但是时代的浪潮犹如海滩上的浪花,总是一浪接着一浪,只要你站在海边,身处这个行业之中,下一个浪潮很快又会到来。你需要敏感而又深刻地去观察,略去那些浮躁的泡沫,抓住真正潮流的机会,奋力一搏,不管成败,都不会遗憾。"}
if __name__ == '__main__':
# ['bow', 'tfidf', 'ngram_tfidf', 'bert']
# ['bow', 'tfidf', 'ngram_tfidf', 'bert', 'w2v']
text_embedding = TextEmbedding( match_models=['bow', 'tfidf', 'ngram_tfidf', 'w2v'], words_dict=test_dict )
pre = text_embedding.predict( "其实事物发展有自己的潮流和规律" )
print ('text_embedding>>>>>', pre)
pre = text_embedding.predict( "其实事物发展有自己的潮流和规律", "id1" )
print ('text_embedding>>>>>', pre) (2)models_test
import sys
from textmatch.models.text_search.bm25 import BM25
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences (the "questions") to index and search over.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    bm25 = BM25()
    bm25.init(words_list, update=True)
    testword = "我在九寨沟,很喜欢"
    # BM25 score of the query against each indexed sentence.
    pre = bm25.predict(testword)
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [-1.27683889 -1.41282764  0.83974856]
import sys
from textmatch.models.text_search.edit_sim import EditDistance
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to compare against.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    edit_dis = EditDistance()
    edit_dis.init(words_list)
    testword = "我在九寨沟,很喜欢"
    # Edit-distance-based similarity of the query to each sentence.
    pre = edit_dis.predict(testword)
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.25, 0.368421052631579, 0.5]
import sys
from textmatch.models.text_search.jaccard_sim import Jaccard
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to compare against.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    jaccard_dis = Jaccard()
    jaccard_dis.init(words_list)
    testword = "我在九寨沟,很喜欢"
    # Jaccard similarity of the query to each sentence.
    pre = jaccard_dis.predict(testword)
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.23529411764705882, 0.3125, 0.6]
import sys
from textmatch.models.text_embedding.bow_sklearn import Bow
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to index.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    # NOTE: BOW_INDEX_PARH is the constant's actual (misspelled) name in the project config.
    bow = Bow(dic_path=const.BOW_DIC_PATH, bow_index_path=const.BOW_INDEX_PARH)
    bow.init(words_list, update=True)
    testword = "我在九寨沟,很喜欢"
    # Debug helper: inspect the tokenization of the query.
    # for word in jieba.cut(testword):
    #     print('>>>>', word)
    # Similarity of the query to each indexed sentence.
    pre = bow.predict(testword)
    print('pre>>>>>', pre)
    # _predict exposes the underlying vector; [0] is its first element
    # (presumably the query's bag-of-words vector — confirm in Bow._predict).
    pre = bow._predict(testword)[0]
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.27735009 0.53033008 0.86602539]
    # pre>>>>> [1. 0. 1. 1. 0. 1. 0.]
import sys
from textmatch.models.text_embedding.tf_idf_sklearn import TfIdf
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to index.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    tfidf = TfIdf(dic_path=const.TFIDF_DIC_PATH, tfidf_model_path=const.TFIDF_MODEL_PATH, tfidf_index_path=const.TFIDF_INDEX_PATH)
    tfidf.init(words_list, update=True)
    testword = "我在九寨沟,很喜欢"
    # Debug helper: inspect the tokenization of the query.
    # for word in jieba.cut(testword):
    #     print('>>>>', word)
    # Similarity of the query to each indexed sentence.
    pre = tfidf.predict(testword)
    print('pre>>>>>', pre)
    # _predict exposes the underlying vector; [0] is its first element
    # (presumably the query's tf-idf vector — confirm in TfIdf._predict).
    pre = tfidf._predict(testword)[0]
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.21094354 0.45357592 0.87701746]
    # pre>>>>> [0.63174505 0.         0.4804584  0.4804584  0.         0.37311881 0.        ]
import sys
from textmatch.models.text_embedding.ngram_tf_idf_sklearn import NgramTfIdf
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to index.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    tfidf = NgramTfIdf(dic_path=const.NGRAM_TFIDF_DIC_PATH, tfidf_model_path=const.NGRAM_TFIDF_MODEL_PATH, tfidf_index_path=const.NGRAM_TFIDF_INDEX_PATH)
    tfidf.init(words_list, update=True)
    testword = "我在九寨沟,很喜欢"
    # Debug helper: inspect the tokenization of the query.
    # for word in jieba.cut(testword):
    #     print('>>>>', word)
    # Similarity of the query to each indexed sentence.
    pre = tfidf.predict(testword)
    print('pre>>>>>', pre)
    # _predict exposes the underlying vector; [0] is its first element
    # (presumably the query's n-gram tf-idf vector — confirm in NgramTfIdf._predict).
    pre = tfidf._predict(testword)[0]
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.         0.14160782 0.99999983]
    # pre>>>>> [0. 0. 0. 0. 0.62276601 0. 0. 0. 0. 0. 0. 0.4736296 0.62276601 0. 0. 0. 0.]
import sys
from textmatch.models.text_embedding.w2v import Word2Vec
from textmatch.models.text_embedding.stop_words import StopWords
from textmatch.config.constant import Constant as const
if __name__ == '__main__':
    # Candidate sentences to index.
    words_list = ["我去玉龙雪山并且喜欢玉龙雪山玉龙雪山","我在玉龙雪山并且喜欢玉龙雪山","我在九寨沟"]
    w2v = Word2Vec(w2v_model_file=const.W2V_MODEL_FILE, stop_word=StopWords(stopwords_file=const.STOPWORDS_FILE))
    w2v.init(words_list, update=True)
    testword = "我在九寨沟,很喜欢"
    # Word2Vec-based similarity of the query to each sentence.
    pre = w2v.predict(testword)
    print('pre>>>>>', pre)
    # Expected output:
    # pre>>>>> [0.17818374 0.27095952 0.70004393]
import time
import jieba
import gensim
import threading
import numpy as np
from textmatch.config.constant import Constant as const
# Coarse ranking: use Word Mover's Distance (WMD) for an initial screen.
# Empirically, pairs scoring 0-0.15 were near-duplicates and 0.45-1 were mostly
# unrelated, so ~10% of the 0.15-0.45 band was sampled for manual labeling.
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
w2v_model_file = const.W2V_MODEL_FILE
w2v_model = gensim.models.Word2Vec.load(w2v_model_file)
# NOTE(review): init_sims is deprecated (removed in gensim 4); and in gensim>=4
# wmdistance lives on w2v_model.wv — confirm the pinned gensim version.
w2v_model.init_sims(replace=True) # normalizes vectors
distance = w2v_model.wmdistance("你们是你们哪,你们哪里的。", "你们是哪里,你们是谁?")
print ('distance>>>>', distance)
'''
"你有什么事你说。", "我是他家人/朋友,你有什么事可以给我说?" 0.6694891459671026
"呃,我想提前结清我名下那个款项。", "我需要提前结清" 0.6992085239002946
"你们是你们哪,你们哪里的。", "你们是哪里,你们是谁?" 0.27438064142232443
"嗯,好。", "你们催收人员说要对我上门催收,是不是真的?" 0.948713353219643
"嗯。就是您就是就是。就是您就是您拨打的这个电话。", "你们催收人员说要对我上门催收,是不是真的?" 0.8855274054486878
"提前结清。", "我需要提前结清" 0.5150805852253076
'''