
Python: Dictionary-Based Chinese Word Segmentation Algorithms


Comparing three segmentation algorithms

dt = {'空调': 1, '调和': 1, '和风': 1, '风扇': 1,
      '空': 1, '调': 1, '和': 2, '风': 1, '扇': 1}  # dictionary: word -> frequency
max_len = max(len(w) for w in dt)  # maximum word length; defaults to the longest dictionary word
total = sum(dt.values())  # total frequency count
sentence = '空调和风扇'
length = len(sentence)

def maximum_matching(sentence):
    """Forward maximum matching"""
    head = 0
    while head < length:
        tail = min(head + max_len, length)
        for middle in range(tail, head + 1, -1):  # try the longest candidate first
            word = sentence[head: middle]
            if word in dt:
                head = middle
                break
        else:  # no multi-character word matched: emit a single character
            word = sentence[head]
            head += 1
        yield word

def reverse_maximum_matching(sentence):
    """Reverse maximum matching"""
    words = []
    tail = length
    while tail > 0:
        head = max(tail - max_len, 0)  # was min(...), which let the window scan words longer than max_len
        for middle in range(head, tail - 1):
            word = sentence[middle: tail]
            if word in dt:
                tail = middle
                break
        else:
            tail -= 1
            word = sentence[tail]
        words.append(word)  # about 6% faster than words.insert(0, word)
    return words[::-1]

def probability(sentence):
    """Maximum-probability path over the word DAG (the article's "Bayesian network" method)"""
    # build the DAG of all dictionary words in the sentence
    DAG = dict()
    for head in range(length):
        DAG[head] = [head]
        tail = min(head + max_len, length)
        for middle in range(head + 2, tail + 1):
            word = sentence[head: middle]
            if word in dt:
                DAG[head].append(middle - 1)
    # compute the best route by dynamic programming, right to left
    route = {}
    route[length] = (1, 1)
    for idx in range(length - 1, -1, -1):
        route[idx] = max(
            (dt.get(sentence[idx:x + 1], 0) / total * route[x + 1][0], x)
            for x in DAG[idx])
    # walk the route and yield words
    x = 0
    while x < length:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        yield l_word
        x = y

print(list(maximum_matching(sentence)))
print(reverse_maximum_matching(sentence))
print(list(probability(sentence)))

Segmentation results for '空调和风扇': forward maximum matching gives ['空调', '和风', '扇'], reverse maximum matching gives ['空', '调和', '风扇'], and the probability method gives ['空调', '和', '风扇'] (since '和' has frequency 2, the path 空调/和/风扇 outscores both maximum-matching answers).

Reverse maximum matching

Scanning from back to front

sentence = '西樵山'
# initialize the scan position at the end of the sentence
tail = len(sentence)
# maximum word length
max_len = 3
# scan backwards, printing every candidate window
while tail > 0:
    head = tail - max_len
    if head < 0:
        head = 0
    for middle in range(head, tail):
        word = sentence[middle: tail]
        print(head, middle, tail, word)
    tail -= 1

0 0 3 西樵山
0 1 3 樵山
0 2 3 山
0 0 2 西樵
0 1 2 樵
0 0 1 西

Dictionary matching

dt = {'桂江二中', '毕业', '二中'}  # dictionary (a set: membership only, no frequencies)
sentence = '桂江二中毕业'
tail = len(sentence)
max_len = 4
words = []
while tail > 0:
    head = max(tail - max_len, 0)
    for middle in range(head, tail - 1):  # skip single-character words; the else branch handles them
        word = sentence[middle: tail]
        if word in dt:
            print(middle, tail - 1, word)
            words.append(word)
            tail = middle
            break
    else:
        tail -= 1
        print(tail, tail, sentence[tail])
        words.append(sentence[tail])
print(words[::-1])

4 5 毕业
0 3 桂江二中
['桂江二中', '毕业']

Probabilistic segmentation

Principle

  1. Using the dictionary, scan the sentence to build a word graph: a directed acyclic graph (DAG) covering every way its characters can form dictionary words.
  2. From the DAG, work backwards to find the maximum-probability path (dynamic programming).
  3. Read the maximum-probability segmentation off that path.

dt = {'空调': 1, '调和': 1, '和风': 1, '风扇': 1,
      '空': 1, '调': 1, '和': 2, '风': 1, '扇': 1}  # dictionary: word -> frequency
max_len = max(len(w) for w in dt)  # maximum word length; defaults to the longest dictionary word
total = sum(dt.values())  # total frequency count
sentence = '空调和风扇'
length = len(sentence)
DAG
DAG = dict()
for head in range(length):
    DAG[head] = [head]
    tail = min(head + max_len, length)
    for middle in range(head + 2, tail + 1):
        word = sentence[head: middle]
        if word in dt:
            DAG[head].append(middle - 1)
print(DAG)

{0: [0, 1], 1: [1, 2], 2: [2, 3], 3: [3, 4], 4: [4]}
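
Each key is a word's start index, and each value lists the inclusive end indices of the dictionary words starting there: DAG[0] = [0, 1] means both '空' (0..0) and '空调' (0..1) begin at position 0.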

Computing the maximum-probability path
route = {}
route[length] = (1, 1)
for idx in range(length - 1, -1, -1):
    route[idx] = max(
        (dt.get(sentence[idx:x + 1], 0) / total * route[x + 1][0], x)
        for x in DAG[idx])
print(route)

{5: (1, 1), 4: (0.1, 4), 3: (0.1, 4), 2: (0.02, 2), 1: (0.01, 2), 0: (0.002, 1)}
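
Step 3 of the principle reads the segmentation off this route table. Starting at position 0, each entry's second element is the inclusive end index of the best word at that position; the walk below (the same loop used inside probability() earlier) recovers the answer.

# walk the maximum-probability route from the start of the sentence
x = 0
words = []
while x < length:
    y = route[x][1] + 1  # route[x][1] is the inclusive end index of the best word at x
    words.append(sentence[x:y])
    x = y
print(words)  # ['空调', '和', '风扇']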

Packaging it as a class
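
The class below reimplements the same DAG + dynamic-programming pipeline on top of jieba's bundled dict.txt. Two practical changes from the toy version: probabilities are accumulated in log space (log(freq) - log(total)) so long sentences do not underflow to zero, and regex fallbacks let unbroken runs of ASCII letters or digits count as words even when the dictionary misses them.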

from os import path
import re
from math import log
import jieba

fname = path.join(path.dirname(jieba.__file__), 'dict.txt')
NA = 'NA'

class Tokenizer:
    re_eng = re.compile('[a-zA-Z]+')
    re_m = re.compile('[0-9]+')  # jieba tags numerals with flag m

    def __init__(self, word2freq, total, word2flag, max_len):
        self.word2freq = word2freq
        self.total = total
        self.word2flag = word2flag
        self.max_len = max_len

    @classmethod
    def initialization(cls):
        word2freq, total, word2flag = dict(), 0, dict()
        with open(fname, encoding='utf-8') as f:
            for line in f.read().strip().split('\n'):
                word, freq, flag = line.split()
                freq = int(freq)
                word2freq[word] = freq
                total += freq
                word2flag[word] = flag
        # maximum word length; defaults to the longest dictionary word
        # (English tokens longer than this will go unrecognized)
        max_len = max(len(i) for i in word2freq.keys())
        return cls(word2freq, total, word2flag, max_len)

    def _get_DAG(self, sentence):
        length = len(sentence)
        DAG = dict()
        for head in range(length):
            DAG[head] = [head]
            tail = min(head + self.max_len, length)
            for middle in range(head + 2, tail + 1):
                word = sentence[head: middle]
                # ------------- dictionary + regex ------------- #
                if word in self.word2freq:
                    DAG[head].append(middle - 1)
                elif self.re_eng.fullmatch(word):
                    DAG[head].append(middle - 1)
                elif self.re_m.fullmatch(word):
                    DAG[head].append(middle - 1)
        return DAG

    def _calculate(self, sentence):
        DAG = self._get_DAG(sentence)
        length = len(sentence)
        route = dict()
        route[length] = (0, 0)  # log(1) = 0 base case
        logtotal = log(self.total)
        for idx in range(length - 1, -1, -1):
            route[idx] = max(
                (log(self.word2freq.get(sentence[idx:x + 1], 1)) - logtotal + route[x + 1][0], x)
                for x in DAG[idx])
        return route

    def cut(self, sentence):
        route = self._calculate(sentence)
        length = len(sentence)
        x = 0
        while x < length:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            yield l_word
            x = y

    def lcut(self, sentence):
        return list(self.cut(sentence))

    def add_word(self, word, freq=-1, flag=NA):
        if freq >= 0:  # explicit frequency: replace any existing entry
            self.del_word(word)
        else:  # no frequency given: bump the existing count by 1
            freq = 1
        original_freq = self.word2freq.get(word, 0)
        self.word2freq[word] = original_freq + freq
        self.total = self.total - original_freq + self.word2freq[word]
        self.word2flag[word] = flag

    def del_word(self, word):
        original_freq = self.word2freq.get(word)
        if original_freq is not None:
            del self.word2freq[word]
            self.total -= original_freq
            del self.word2flag[word]

    def cut2position(self, sentence):
        route = self._calculate(sentence)
        x = 0
        length = len(sentence)
        while x < length:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            yield l_word, x, y
            x = y

    def get_flag(self, word):
        return self.word2flag.get(word, NA)

    def get_flags(self, words):
        if isinstance(words, str):
            words = self.cut(words)
        return [self.get_flag(word) for word in words]

# instantiate a module-level tokenizer and expose its methods
tokenizer = Tokenizer.initialization()
cut = tokenizer.cut
lcut = tokenizer.lcut
add_word = tokenizer.add_word
del_word = tokenizer.del_word
cut2position = tokenizer.cut2position
get_flag = tokenizer.get_flag
get_flags = tokenizer.get_flags

if __name__ == '__main__':
    text = '幻刺斩杀大法师'
    print(lcut(text))
    add_word('幻刺', 2, 'N')
    print(list(cut2position(text)))
    del_word('大法师')
    print(lcut(text))
    print(get_flags(text))

Supplementary graph theory

Ways to represent a graph

%matplotlib inline
import networkx as nx
# create a directed graph
G = nx.DiGraph()
# add edges
G.add_edges_from([(0, 1), (0, 2), (1, 2), (2, 3)])
# draw it (%matplotlib inline assumes a Jupyter notebook)
nx.draw(G, with_labels=True, font_size=36, node_size=1500, width=4, node_color='lightgreen')

Matrix

class G:
    def __init__(self, nodes):
        # nodes x nodes adjacency matrix, 0 = no edge
        self.matrix = [[0] * nodes for _ in range(nodes)]

    def add_edge(self, start, end, value=1):
        self.matrix[start][end] = value

g = G(4)
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 3)
print(g.matrix)

Dictionary

class G:
    def __init__(self):
        self.dt = dict()

    def add_edge(self, start, end, value=1):
        # nested dict: start -> {end: value}
        self.dt.setdefault(start, dict())[end] = value

g = G()
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 3)
print(g.dt)
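
The matrix form uses O(n²) space no matter how few edges exist, which suits dense graphs; the nested-dict form stores only the edges actually present, which is why the segmentation DAGs above use a dict keyed by start position.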

Probabilistic graphical models

  • Probabilistic graphical model: uses a graph to represent the joint probability distribution over a model's variables
  • Bayesian network model: a directed acyclic graph plus conditional probability tables (Conditional Probability Table, CPT)

Bayesian networks
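
To make the DAG-plus-CPT definition concrete, here is a minimal sketch not taken from the article: a hypothetical two-node network where rain influences whether the grass is wet, with the joint probability factored along the single edge of the DAG.

# Toy Bayesian network: rain -> wet_grass (hypothetical numbers).
# The DAG is the single edge; the CPTs are plain dicts.
p_rain = {True: 0.2, False: 0.8}                     # P(rain)
p_wet_given_rain = {True: {True: 0.9, False: 0.1},   # P(wet | rain=True)
                    False: {True: 0.2, False: 0.8}}  # P(wet | rain=False)

def joint(rain, wet):
    """Joint probability factored along the DAG: P(rain) * P(wet | rain)."""
    return p_rain[rain] * p_wet_given_rain[rain][wet]

# P(wet) marginalizes rain out: 0.2*0.9 + 0.8*0.2 = 0.34
print(sum(joint(rain, True) for rain in (True, False)))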
