词相当于一种固定搭配
内部凝固度(互信息)
公式:$\mathrm{PMI}(w)=\frac{1}{n}\log_{10}\frac{p(w)}{\prod_{i=1}^{n}p(c_i)}$,其中 $c_i$ 为词 $w$ 中的第 $i$ 个字,$n$ 为词长(取平均使不同词长可比)
左熵、右熵
判断组合左右的混乱程度,如果左、右熵都很大,表明组合左右变化很大,证明这个组合是一个词的概率很大
公式:$H=-\sum_i p_i\log_{10}p_i$,其中 $p_i$ 为某个左(右)邻字出现的频率
代码:
import math
from collections import defaultdict
class NewWordDetect:
    """Unsupervised new-word discovery over a plain-text corpus.

    Every character n-gram (lengths 1 .. max_word_length-1) is counted, then
    scored by combining:
      * internal cohesion — pointwise mutual information (PMI), and
      * boundary freedom — Shannon entropy of the left/right neighbor chars.
    Final score per candidate word: PMI * left_entropy * right_entropy.
    """

    def __init__(self, corpus_path):
        # n-gram window upper bound; lengths 1..max_word_length-1 are counted.
        self.max_word_length = 5
        self.word_count = defaultdict(int)      # n-gram -> occurrence count
        self.left_neighbor = defaultdict(dict)  # n-gram -> {left char: count}
        self.right_neighbor = defaultdict(dict) # n-gram -> {right char: count}
        self.load_corpus(corpus_path)  # read the corpus and collect counts
        self.calc_pmi()                # internal cohesion (mutual information)
        self.calc_entropy()            # left/right boundary entropy
        self.calc_word_values()        # combined score per candidate word

    def load_corpus(self, path):
        """Read the corpus line by line and count n-grams of every length."""
        with open(path, encoding="utf8") as f:
            for line in f:
                sentence = line.strip()
                for word_length in range(1, self.max_word_length):
                    self.ngram_count(sentence, word_length)
        return

    def ngram_count(self, sentence, word_length):
        """Slide a window of ``word_length`` chars over ``sentence``, counting
        each n-gram and its immediate left/right neighbor characters.

        BUG FIX vs. original: (1) the method was missing ``self``; (2) the
        neighbor increment read ``self.left_neighbor.get(char, 0)``, looking
        ``char`` up in the OUTER dict (whose values are dicts) instead of in
        the per-word inner dict — which crashed with ``dict + 1`` as soon as
        ``char`` was itself a counted n-gram.
        """
        for i in range(len(sentence) - word_length + 1):
            word = sentence[i:i + word_length]
            self.word_count[word] += 1
            if i - 1 >= 0:  # a left neighbor exists
                char = sentence[i - 1]
                self.left_neighbor[word][char] = self.left_neighbor[word].get(char, 0) + 1
            if i + word_length < len(sentence):  # a right neighbor exists
                char = sentence[i + word_length]
                self.right_neighbor[word][char] = self.right_neighbor[word].get(char, 0) + 1
        return

    def calc_entropy(self):
        """Compute left and right neighbor entropy for every n-gram."""
        self.word_left_entropy = {}   # n-gram -> entropy of its left neighbors
        self.word_right_entropy = {}  # n-gram -> entropy of its right neighbors
        for word, count_dict in self.left_neighbor.items():
            self.word_left_entropy[word] = self.calc_entropy_by_word_count_dict(count_dict)
        for word, count_dict in self.right_neighbor.items():
            self.word_right_entropy[word] = self.calc_entropy_by_word_count_dict(count_dict)

    def calc_entropy_by_word_count_dict(self, word_count_dict):
        """Shannon entropy (base 10) of a ``{char: count}`` distribution."""
        total = sum(word_count_dict.values())  # total neighbor occurrences
        entropy = sum(-(c / total) * math.log(c / total, 10)
                      for c in word_count_dict.values())
        return entropy

    def calc_total_count_by_length(self):
        """Total token count per n-gram length (probability denominators)."""
        self.word_count_by_length = defaultdict(int)
        for word, count in self.word_count.items():
            self.word_count_by_length[len(word)] += count
        return

    def calc_pmi(self):
        """Per-character-averaged PMI: log10(p(word) / prod p(char)) / len(word)."""
        self.calc_total_count_by_length()
        self.pmi = {}
        for word, count in self.word_count.items():
            p_word = count / self.word_count_by_length[len(word)]
            p_chars = 1
            for char in word:
                # independence baseline: product of single-char probabilities
                p_chars *= self.word_count[char] / self.word_count_by_length[1]
            self.pmi[word] = math.log(p_word / p_chars, 10) / len(word)
        return

    def calc_word_values(self):
        """Final score = PMI * left entropy * right entropy.

        Single characters and n-grams containing an ASCII comma are skipped;
        missing factors fall back to a small floor (1e-3) instead of zero.
        """
        self.word_values = {}
        for word in self.pmi:
            if len(word) < 2 or "," in word:
                continue
            pmi = self.pmi.get(word, 1e-3)
            le = self.word_left_entropy.get(word, 1e-3)
            re = self.word_right_entropy.get(word, 1e-3)
            self.word_values[word] = pmi * le * re
if __name__ == '__main__':
    # Build the detector over the sample corpus and rank every candidate
    # word by its combined PMI * left-entropy * right-entropy score.
    detector = NewWordDetect("sample_corpus.txt")
    ranked = sorted(detector.word_values.items(),
                    key=lambda item: item[1], reverse=True)
    # Show the ten highest-scoring two-character candidates.
    print([word for word, score in ranked if len(word) == 2][:10])