方法:找到源码的变量进行修改
示例:使【Blade Master】这类中间有空格的词被识别
import re

import jieba

# Demo: make jieba keep a dictionary word that contains a space.
sentence = 'Blade Master疾风刺杀Archmage'
jieba.add_word('Blade Master')  # register the spaced phrase

# First cut: run before the pattern below is patched, for comparison.
print(list(jieba.cut(sentence)))

# Replace jieba's module-level block pattern with one whose character class
# also contains a space.  A raw string is used so that `\.` is not an invalid
# string escape; the `re` module resolves the `\uXXXX` escapes itself, so the
# compiled pattern is unchanged.
jieba.re_han_default = re.compile(r'([\u4E00-\u9FD5a-zA-Z0-9+#&\._% -]+)', re.U)

# Second cut: same sentence after the patch.
print(list(jieba.cut(sentence)))
import re

import jieba
import jieba.posseg as jp

# Demo: part-of-speech tagging for a user word that contains a space.
sentence = 'Demon Hunter斩杀大法师'

# Register the phrase with frequency 9 and the custom tag 'hero'.
jieba.add_word('Demon Hunter', freq=9, tag='hero')

# Patch the posseg module's pattern so it matches any run of characters.
match_everything = re.compile('(.+)', flags=re.U)
jp.re_han_internal = match_everything

print(jp.lcut(sentence))
import re

import jieba
import jieba.posseg as jp

# Demo: tag a sentence mixing the spaced word 'D H' with the unspaced 'DH'.
sentence = 'D H的D H的DH'

# Patch the posseg pattern so it matches any run of characters.
jp.re_han_internal = re.compile('(.+)', flags=re.U)

# Register the user words; '的' additionally gets the custom tag 'DE'.
jieba.add_word('D H')
jieba.add_word('的', tag='DE')

# Print the tagging twice: with default options, then with HMM disabled.
for options in ({}, {'HMM': False}):
    print(jp.lcut(sentence, **options))
from jieba import dt
from jieba.posseg import POSTokenizer

# Demo: a POSTokenizer built on the default tokenizer `dt` is re-run after
# each word is added to `dt`.
text = '正义的洪基伟斩杀邪恶的巨法师'
pos_dt = POSTokenizer(dt)

# Baseline tagging before any user words are added.
print(pos_dt.lcut(text))

# Add each word to `dt`, then print the tagging again.
for word, options in (
    ('巨法师', {'freq': 1, 'tag': 'DDD'}),
    ('的', {'tag': 'DE'}),
):
    dt.add_word(word, **options)
    print(pos_dt.lcut(text))
import jieba

# Demo: segmentation of mixed letters/digits before and after adding the
# user words '360' and 'app'.
s = 'apple均价93600'

# Baseline segmentation, tokens separated by single spaces.
print(*jieba.cut(s))

for extra_word in ('360', 'app'):
    jieba.add_word(extra_word)

# Same sentence after the two words were added.
print(*jieba.cut(s))
from os import path
import re
import jieba
from math import log
# Path of the 'dict.txt' file located next to the installed jieba package.
jieba_dict = path.join(path.split(jieba.__file__)[0], 'dict.txt')
class Token:
    """Dictionary-based word segmenter using a DAG and a max-probability route.

    A candidate word is any substring that is in the frequency dictionary or
    fully matches one of the two class-level patterns (ASCII word or numeric
    token).  The segmentation chosen is the one maximising the sum of
    ``log(freq / total)`` over its words; words without a positive
    frequency count as frequency 1.

    Attributes are plain and may be inspected or replaced by callers:
    ``dt`` maps word -> frequency, ``total`` is the sum of all frequencies,
    and ``max_len`` is the longest candidate length (in characters) tried.
    """

    # ASCII word: a letter followed by letters, digits, '_' or '-'.
    re_eng = re.compile('[a-zA-Z][a-zA-Z0-9_-]*')
    # Numeric token (jieba tags numerals as "m"): a digit followed by digits
    # or any of . + % / ~ -
    re_m = re.compile('[0-9][0-9.+%/~-]*')

    def __init__(self, dt, total, max_len):
        """Store the frequency dict, its total frequency and the length cap."""
        self.dt = dt
        self.total = total
        self.max_len = max_len

    @classmethod
    def initialization(cls, dict_path=None):
        """Build a tokenizer from a whitespace-separated dictionary file.

        Each useful line holds a word followed by an integer frequency; any
        further fields on the line are ignored.  Lines with fewer than two
        fields (for example blank lines) are skipped.

        Args:
            dict_path: Path of the dictionary file.  When ``None`` the
                module-level ``jieba_dict`` path is used.

        Returns:
            A new instance whose ``max_len`` equals the longest dictionary
            word, so longer ASCII/numeric runs are not kept as one token.

        Raises:
            ValueError: If a frequency field is not an integer.
            OSError: If the file cannot be opened.
        """
        if dict_path is None:
            dict_path = jieba_dict
        dt = {}
        with open(dict_path, encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue
                dt[fields[0]] = int(fields[1])
        # Total frequency of all dictionary words.
        total = sum(dt.values())
        # Longest word length; 0 for an empty dictionary.
        max_len = max(map(len, dt), default=0)
        return cls(dt, total, max_len)

    def _get_DAG(self, sentence):
        """Return ``{start: [end, ...]}`` of candidate words in ``sentence``.

        Indices are inclusive.  Every position lists itself, so a
        single-character fallback always exists.
        """
        length = len(sentence)
        dag = {}
        for head in range(length):
            tail = min(head + self.max_len, length)
            ends = [head]
            # Only substrings of two or more characters need checking.
            for middle in range(head + 2, tail + 1):
                word = sentence[head:middle]
                if (word in self.dt
                        or self.re_eng.fullmatch(word)
                        or self.re_m.fullmatch(word)):
                    ends.append(middle - 1)
            dag[head] = ends
        return dag

    def _calculate(self, sentence):
        """Return ``{index: (best_log_prob, best_end)}`` for every start index.

        The table is filled from the end of the sentence backwards, so the
        entry for ``x + 1`` is always ready when index ``idx`` is scored.
        """
        DAG = self._get_DAG(sentence)
        N = len(sentence)
        route = {N: (0, 0)}
        # Guard against log(0) when the dictionary is empty.
        logtotal = log(self.total) if self.total > 0 else 0.0
        for idx in range(N - 1, -1, -1):
            route[idx] = max(
                # `or 1` covers both unknown words and a stored frequency of
                # 0, which would otherwise make log() raise.
                (log(self.dt.get(sentence[idx:x + 1]) or 1)
                 - logtotal + route[x + 1][0], x)
                for x in DAG[idx])
        return route

    def cut(self, sentence):
        """Yield the words of ``sentence`` along the best route, in order."""
        route = self._calculate(sentence)
        N = len(sentence)
        x = 0
        while x < N:
            y = route[x][1] + 1
            yield sentence[x:y]
            x = y

    def lcut(self, sentence):
        """Return the segmentation of ``sentence`` as a list."""
        return list(self.cut(sentence))

    def add_word(self, word, freq=1):
        """Set the frequency of ``word`` to ``freq`` and update ``total``."""
        original_freq = self.dt.get(word, 0)
        self.dt[word] = freq
        self.total = self.total - original_freq + freq

    def del_word(self, word):
        """Remove ``word`` if present, subtracting its frequency from ``total``."""
        original_freq = self.dt.get(word)
        if original_freq is not None:
            del self.dt[word]
            self.total -= original_freq
# Shared tokenizer built via Token.initialization(); its bound methods are
# re-exported as module-level functions.
tokenizer = Token.initialization()
cut, lcut = tokenizer.cut, tokenizer.lcut
add_word, del_word = tokenizer.add_word, tokenizer.del_word
if __name__ == '__main__':
    # Compare jieba with this module's tokenizer on the same sentence,
    # before and after registering the user word '60r'.
    s = '小米60r价值3660rmb'

    # jieba: before / after adding the word.
    print(*jieba.cut(s))
    jieba.add_word('60r')
    print(*jieba.cut(s))

    print('——')

    # This module's tokenizer: before / after adding the word.
    print(*cut(s))
    add_word('60r')
    print(*cut(s))
from jieba import dt

# Demo: inspect the default tokenizer's frequency table `dt.FREQ`.
# This first print runs before any dictionary has been loaded.
print(dt.FREQ)

# Write a one-entry user dictionary (word, frequency, tag) and load it.
with open('a.txt', mode='w', encoding='utf-8') as f:
    f.write('柳梦璃 99 nr')
dt.load_userdict('a.txt')
print('柳梦璃', dt.FREQ['柳梦璃'])

# Frequency of '不' as loaded, after add_word() without an explicit
# frequency, and after add_word() with frequency 9.
print('不', dt.FREQ['不'])
for freq_args in ((), (9,)):
    dt.add_word('不', *freq_args)
    print('不', dt.FREQ['不'])
dt.FREQ初始化前是空字典{}
初始化的时候会打印一大串红色文字(jieba加载词典时输出的日志),初始化完成后,dt.FREQ才不为空