```python
dt = {'空调': 1, '调和': 1, '和风': 1, '风扇': 1,
      '空': 1, '调': 1, '和': 2, '风': 1, '扇': 1}  # dictionary
max_len = max(len(w) for w in dt)  # maximum word length, defaults to the longest dictionary entry
total = sum(dt.values())  # total frequency
sentence = '空调和风扇'

def maximum_matching(sentence):
    """Forward maximum matching"""
    length = len(sentence)
    head = 0
    while head < length:
        tail = min(head + max_len, length)
        for middle in range(tail, head + 1, -1):  # try the longest candidate first
            word = sentence[head: middle]
            if word in dt:
                head = middle
                break
        else:  # no multi-character word matched: emit a single character
            word = sentence[head]
            head += 1
        yield word

def reverse_maximum_matching(sentence):
    """Reverse maximum matching"""
    words = []
    tail = len(sentence)
    while tail > 0:
        head = max(tail - max_len, 0)  # clamp the window at the sentence start
        for middle in range(head, tail - 1):  # candidates from longest down to length 2
            word = sentence[middle: tail]
            if word in dt:
                tail = middle
                break
        else:  # fall back to a single character
            tail -= 1
            word = sentence[tail]
        words.append(word)  # about 6% faster than words.insert(0, word)
    return words[::-1]

def probability(sentence):
    """Bayesian network: maximum-probability path through the word DAG"""
    length = len(sentence)
    # get DAG
    DAG = dict()
    for head in range(length):
        DAG.update({head: [head]})
        tail = min(head + max_len, length)
        for middle in range(head + 2, tail + 1):
            word = sentence[head: middle]
            if word in dt:
                DAG[head].append(middle - 1)
    # calculate route: route[idx] = (best probability, end position of the first word)
    route = {}
    route[length] = (1, 1)
    for idx in range(length - 1, -1, -1):
        route[idx] = max(
            (dt.get(sentence[idx:x + 1], 0) / total * route[x + 1][0], x)
            for x in DAG[idx])
    # yield the words along the best path
    x = 0
    while x < length:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        yield l_word
        x = y

print(list(maximum_matching(sentence)))
print(reverse_maximum_matching(sentence))
print(list(probability(sentence)))
```
Segmentation results for '空调和风扇'
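For reference, hand-tracing the three calls on this dictionary gives:

['空调', '和风', '扇']
['空', '调和', '风扇']
['空调', '和', '风扇']

Forward matching greedily commits to '空调' and '和风' and strands '扇'; reverse matching strands '空'; the probability route picks '空调 / 和 / 风扇' because '和' has the highest frequency in the dictionary.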
```python
sentence = '西樵山'
# initialize the scan position at the end of the sentence
tail = len(sentence)
# maximum word length
max_len = 3
# scan backwards
while tail > 0:
    head = tail - max_len
    if head < 0:
        head = 0
    for middle in range(head, tail):
        word = sentence[middle: tail]
        print(head, middle, tail, word)
    tail -= 1
```
0 0 3 西樵山
0 1 3 樵山
0 2 3 山
0 0 2 西樵
0 1 2 樵
0 0 1 西
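Each printed line shows head, middle, tail and the candidate slice sentence[middle:tail]: within each window the longest slice is tried first, and since no dictionary is consulted here, tail simply moves one character to the left on every pass.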
```python
dt = {'桂江二中', '毕业', '二中'}  # dictionary
sentence = '桂江二中毕业'

tail = len(sentence)
max_len = 4
words = []
while tail > 0:
    head = max(tail - max_len, 0)
    for middle in range(head, tail - 1):  # ignore words of length 1
        word = sentence[middle: tail]
        if word in dt:
            print(middle, tail - 1, word)
            words.append(word)
            tail = middle
            break
    else:
        tail -= 1
        print(tail, tail, sentence[tail])
        words.append(sentence[tail])
print(words[::-1])
```
4 5 毕业
0 3 桂江二中
['桂江二中', '毕业']
```python
dt = {'空调': 1, '调和': 1, '和风': 1, '风扇': 1,
      '空': 1, '调': 1, '和': 2, '风': 1, '扇': 1}  # dictionary
max_len = max(len(w) for w in dt)  # maximum word length, defaults to the longest dictionary entry
total = sum(dt.values())  # total frequency
sentence = '空调和风扇'
length = len(sentence)

DAG = dict()
for head in range(length):
    DAG.update({head: [head]})
    tail = min(head + max_len, length)
    for middle in range(head + 2, tail + 1):
        word = sentence[head: middle]
        if word in dt:
            DAG[head].append(middle - 1)
print(DAG)
```
{0: [0, 1], 1: [1, 2], 2: [2, 3], 3: [3, 4], 4: [4]}
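DAG[head] lists the inclusive end positions of every dictionary word starting at head, so 0: [0, 1] covers both the single character '空' (ending at 0) and the word '空调' (ending at 1), while position 4 only admits '扇'.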
```python
route = {}
route[length] = (1, 1)  # sentinel just past the sentence end
for idx in range(length - 1, -1, -1):
    # route[idx] = (probability of the best segmentation of sentence[idx:],
    #               end position of its first word)
    route[idx] = max(
        (dt.get(sentence[idx:x + 1], 0) / total * route[x + 1][0], x)
        for x in DAG[idx])
print(route)
```
{5: (1, 1), 4: (0.1, 4), 3: (0.1, 4), 2: (0.02, 2), 1: (0.01, 2), 0: (0.002, 1)}
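The final step just walks the best path recorded in route from left to right, exactly as the yield loop in probability() does. A minimal sketch, reusing the sentence and route computed above:

```python
x = 0
words = []
while x < len(sentence):
    y = route[x][1] + 1  # the best word starting at x ends at y (exclusive)
    words.append(sentence[x:y])
    x = y
print(words)  # ['空调', '和', '风扇']
```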
```python
from os import path
import re
import jieba
from math import log

fname = path.join(path.dirname(jieba.__file__), 'dict.txt')
NA = 'NA'


class Tokenizer:
    re_eng = re.compile('[a-zA-Z]+')
    re_m = re.compile('[0-9]+')  # jieba tags numerals as m

    def __init__(self, word2freq, total, word2flag, max_len):
        self.word2freq = word2freq
        self.total = total
        self.word2flag = word2flag
        self.max_len = max_len

    @classmethod
    def initialization(cls):
        word2freq, total, word2flag = dict(), 0, dict()
        with open(fname, encoding='utf-8') as f:
            for line in f.read().strip().split('\n'):
                word, freq, flag = line.split()
                freq = int(freq)
                word2freq[word] = freq
                total += freq
                word2flag[word] = flag
        # maximum word length, defaults to the longest dictionary entry
        # (English tokens longer than this will not be recognized)
        max_len = max(len(i) for i in word2freq.keys())
        return cls(word2freq, total, word2flag, max_len)

    def _get_DAG(self, sentence):
        length = len(sentence)
        DAG = dict()
        for head in range(length):
            DAG.update({head: [head]})
            tail = min(head + self.max_len, length)
            for middle in range(head + 2, tail + 1):
                word = sentence[head: middle]
                # ------------- dictionary + regex ------------- #
                if word in self.word2freq:
                    DAG[head].append(middle - 1)
                elif self.re_eng.fullmatch(word):
                    DAG[head].append(middle - 1)
                elif self.re_m.fullmatch(word):
                    DAG[head].append(middle - 1)
        return DAG

    def _calculate(self, sentence):
        DAG = self._get_DAG(sentence)
        length = len(sentence)
        route = dict()
        route[length] = (0, 0)
        logtotal = log(self.total)  # log probabilities avoid float underflow
        for idx in range(length - 1, -1, -1):
            route[idx] = max(
                (log(self.word2freq.get(sentence[idx:x + 1], 1)) - logtotal + route[x + 1][0], x)
                for x in DAG[idx])
        return route

    def cut(self, sentence):
        route = self._calculate(sentence)
        length = len(sentence)
        x = 0
        while x < length:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            yield l_word
            x = y

    def lcut(self, sentence):
        return list(self.cut(sentence))

    def add_word(self, word, freq=-1, flag=NA):
        if freq >= 0:
            self.del_word(word)  # explicit frequency: replace any existing entry
        else:
            freq = 1  # default: bump the existing frequency by 1
        original_freq = self.word2freq.get(word, 0)
        self.word2freq[word] = original_freq + freq
        self.total = self.total - original_freq + self.word2freq[word]
        self.word2flag[word] = flag

    def del_word(self, word):
        original_freq = self.word2freq.get(word)
        if original_freq is not None:
            del self.word2freq[word]
            self.total -= original_freq
            del self.word2flag[word]

    def cut2position(self, sentence):
        route = self._calculate(sentence)
        x = 0
        length = len(sentence)
        while x < length:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            yield l_word, x, y
            x = y

    def get_flag(self, word):
        return self.word2flag.get(word, NA)

    def get_flags(self, words):
        if isinstance(words, str):
            words = self.cut(words)
        return [self.get_flag(word) for word in words]


# instantiate
tokenizer = Tokenizer.initialization()
cut = tokenizer.cut
lcut = tokenizer.lcut
add_word = tokenizer.add_word
del_word = tokenizer.del_word
cut2position = tokenizer.cut2position
get_flag = tokenizer.get_flag
get_flags = tokenizer.get_flags


if __name__ == '__main__':
    text = '幻刺斩杀大法师'
    print(lcut(text))
    add_word('幻刺', 2, 'N')
    print(list(cut2position(text)))
    del_word('大法师')
    print(lcut(text))
    print(get_flags(text))
```
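A note on what the demo exercises, assuming the stock dict.txt shipped with jieba: the first lcut uses only the built-in vocabulary; after add_word('幻刺', 2, 'N') the out-of-vocabulary word 幻刺 should come out as a single token, with cut2position also reporting each token's start and end offsets; del_word('大法师') removes that entry (a no-op if it is absent), so 大法师 then segments into shorter dictionary words; finally get_flags looks up the recorded flag per token, returning 'NA' for unknown words.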
```python
%matplotlib inline
import networkx as nx

# create a directed graph
G = nx.DiGraph()
# add the edges
G.add_edges_from([(0, 1), (0, 2), (1, 2), (2, 3)])
# draw it
nx.draw(G, with_labels=True, font_size=36, node_size=1500, width=4, node_color='lightgreen')
```
```python
class G:
    """Directed graph stored as an adjacency matrix"""
    def __init__(self, nodes):
        self.matrix = [[0] * nodes for _ in range(nodes)]

    def add_edge(self, start, end, value=1):
        self.matrix[start][end] = value

g = G(4)
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 3)
print(g.matrix)
```
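[[0, 1, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]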
```python
class G:
    """Directed graph stored as a nested adjacency dict"""
    def __init__(self):
        self.dt = dict()

    def add_edge(self, start, end, value=1):
        self.dt[start] = self.dt.get(start, dict())
        self.dt[start][end] = value

g = G()
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 3)
print(g.dt)
```
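{0: {1: 1, 2: 1}, 1: {2: 1}, 2: {3: 1}}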