方法:找到源码的变量进行修改
示例:使【Blade Master】这类中间含空格的词被识别
- import jieba, re
- sentence = 'Blade Master疾风刺杀Archmage'
- jieba.add_word('Blade Master') # 添词
- print([word for word in jieba.cut(sentence)])
- jieba.re_han_default = re.compile('([\u4E00-\u9FD5a-zA-Z0-9+#&\._% -]+)', re.U) # 修改格式
- print([word for word in jieba.cut(sentence)])
-
- import jieba, jieba.posseg as jp, re
- sentence = 'Demon Hunter斩杀大法师'
- jieba.add_word('Demon Hunter', 9, 'hero') # 添词
- jp.re_han_internal = re.compile('(.+)', re.U) # 修改格式
- print(jp.lcut(sentence))
-
- import jieba, jieba.posseg as jp, re
- sentence = 'D H的D H的DH'
- # 修改格式
- jp.re_han_internal = re.compile('(.+)', re.U)
- # 添词
- jieba.add_word('D H')
- jieba.add_word('的', tag='DE')
- # 打印
- print(jp.lcut(sentence))
- print(jp.lcut(sentence, HMM=False))
-
- from jieba import dt
- from jieba.posseg import POSTokenizer
-
- text = '正义的洪基伟斩杀邪恶的巨法师'
-
- pos_dt = POSTokenizer(dt)
- print(pos_dt.lcut(text))
-
- dt.add_word('巨法师', 1, 'DDD')
- print(pos_dt.lcut(text))
-
- dt.add_word('的', tag='DE')
- print(pos_dt.lcut(text))
-
- import jieba
- s = 'apple均价93600'
- print(' '.join(jieba.cut(s)))
- jieba.add_word('360')
- jieba.add_word('app')
- print(' '.join(jieba.cut(s)))
-
- from os import path
- import re
- import jieba
- from math import log
- jieba_dict = path.join(path.dirname(jieba.__file__), 'dict.txt')
-
-
class Token:
    """A minimal DAG-based tokenizer modelled on jieba's core algorithm.

    Candidate words come from three sources: the frequency dictionary,
    an English-word regex and a numeric regex (jieba tags numbers 'm').
    Segmentation picks, via dynamic programming over the DAG, the path
    whose words have the highest total log-probability; unknown but
    regex-matched words get a default frequency of 1.
    """

    # Latin-letter word: a letter followed by letters/digits/underscore/hyphen.
    re_eng = re.compile('[a-zA-Z][a-zA-Z0-9_-]*')
    # Numeric token: digits plus . + % / ~ -  (jieba POS-tags these as 'm').
    re_m = re.compile('[0-9][0-9.+%/~-]*')

    def __init__(self, dt, total, max_len):
        """
        :param dt: dict mapping word -> integer frequency
        :param total: sum of all frequencies in ``dt``
        :param max_len: longest candidate word considered while matching
        """
        self.dt = dt
        self.total = total
        self.max_len = max_len

    @classmethod
    def initialization(cls):
        """Alternate constructor: load jieba's bundled ``dict.txt``."""
        with open(jieba_dict, encoding='utf-8') as f:
            dt = {line.split()[0]: int(line.split()[1]) for line in f.read().strip().split('\n')}
        # Total frequency mass, denominator of every word probability.
        total = sum(dt.values())
        # Longest dictionary word. Words longer than this are never even
        # considered by _get_DAG; add_word() raises the limit when needed.
        max_len = max(len(i) for i in dt.keys())
        return cls(dt, total, max_len)

    def _get_DAG(self, sentence):
        """Build the DAG: head index -> list of possible tail indices (inclusive)."""
        length = len(sentence)
        dag = dict()
        for head in range(length):
            tail = min(head + self.max_len, length)
            dag[head] = [head]  # a single character is always a candidate
            for middle in range(head + 2, tail + 1):
                word = sentence[head:middle]
                # ------------- dictionary + regexes ------------- #
                if (word in self.dt
                        or self.re_eng.fullmatch(word)
                        or self.re_m.fullmatch(word)):
                    dag[head].append(middle - 1)
        return dag

    def _calculate(self, sentence):
        """Dynamic programming over the DAG, right to left.

        route[i] = (best log-probability of sentence[i:], tail index of the
        first word on that best path). Out-of-dictionary words fall back to
        frequency 1.
        """
        DAG = self._get_DAG(sentence)
        route = dict()
        N = len(sentence)
        route[N] = (0, 0)  # base case: empty suffix
        logtotal = log(self.total)
        for idx in range(N - 1, -1, -1):
            route[idx] = max(
                (log(self.dt.get(sentence[idx:x + 1], 1)) - logtotal + route[x + 1][0], x)
                for x in DAG[idx])
        return route

    def cut(self, sentence):
        """Yield the words of the best segmentation of ``sentence``."""
        route = self._calculate(sentence)
        x = 0
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1  # tail index of the chosen word, exclusive
            yield sentence[x:y]
            x = y

    def lcut(self, sentence):
        """Like :meth:`cut`, but return a list instead of a generator."""
        return list(self.cut(sentence))

    def add_word(self, word, freq=1):
        """Add (or overwrite) ``word`` with frequency ``freq``.

        Bug fix: also raise ``max_len`` when the new word is longer than the
        current maximum — otherwise _get_DAG could never match it at all.
        """
        original_freq = self.dt.get(word, 0)
        self.dt[word] = freq
        self.total = self.total - original_freq + freq
        if len(word) > self.max_len:
            self.max_len = len(word)

    def del_word(self, word):
        """Remove ``word`` if present, keeping ``total`` consistent.

        ``max_len`` is deliberately left unchanged: a too-large value only
        costs a few extra candidate checks, never wrong output.
        """
        original_freq = self.dt.get(word)
        if original_freq is not None:
            del self.dt[word]
            self.total -= original_freq
-
-
- tokenizer = Token.initialization()
- cut = tokenizer.cut
- lcut = tokenizer.lcut
- add_word = tokenizer.add_word
- del_word = tokenizer.del_word
-
-
- if __name__ == '__main__':
- s = '小米60r价值3660rmb'
- print(' '.join(jieba.cut(s)))
- jieba.add_word('60r')
- print(' '.join(jieba.cut(s)))
- print('——')
- print(' '.join(cut(s)))
- add_word('60r')
- print(' '.join(cut(s)))
-
- from jieba import dt
- print(dt.FREQ)
- with open('a.txt', 'w', encoding='utf-8') as f:
- f.write('柳梦璃 99 nr')
- dt.load_userdict('a.txt')
- print('柳梦璃', dt.FREQ['柳梦璃'])
- print('不', dt.FREQ['不'])
- dt.add_word('不')
- print('不', dt.FREQ['不'])
- dt.add_word('不', 9)
- print('不', dt.FREQ['不'])
-
dt.FREQ 在初始化前是空字典 {}。
初始化时 jieba 会输出一串加载日志(写到 stderr,终端里通常显示为红色);初始化完成后,dt.FREQ 才会填充词频数据。