英语单词拼写检查

Contact me

Blog -> https://cugtyt.github.io/blog/index
Email -> cugtyt@qq.com
GitHub -> Cugtyt@GitHub

本系列博客主页及相关见此处

主要参考知乎鹅厂面试题，英语单词拼写检查算法？中南葱的回答和How to Write a Spelling Corrector

一个直接想法是，遍历词典，比较和当前词的编辑距离(leetcode 72 编辑距离)。

这个方法当然是可行的，但是问题是时间复杂度很高，需要遍历整个字典。改进的方法是空间换时间，用树结构保持字典，这样可以极大的减小搜索空间。下面是用trie树的方法：

def make_trie(words):
    trie = {}
    for word in words:
        t = trie
        for c in word:
            if c not in t: t[c] = {}
            t = t[c]
        t[None] = None
    return trie

def check_fuzzy(trie, word, path='', tol=1):
    if word == '':
        return {path} if None in trie else set()
    else:
        p0 = set()
        if word[0] in trie:
            p0 = check_fuzzy(trie[word[0]], word[1:], path+word[0], tol)
        p1 = set()
        if tol > 0:
            for k in trie:
                if k is not None and k != word[0]:
                    p1.update(check_fuzzy(trie[k], word[1:], path+k, tol-1))
        return p0 | p1

p0表示准确查找，p1表示相近查找，这里使用了tol参数，用于表示可以容忍几个近似值。

另外为了支持新的容错如增删字母，可以如下修改：

def check_fuzzy(trie, word, path='', tol=1):
    if tol < 0:
        return set()
    elif word == '':
        results = set()
        if None in trie:
            results.add(path)
        # 增加词尾字母
        for k in trie:
            if k is not None:
                results |= check_fuzzy(trie[k], '', path+k, tol-1)
        return results
    else:
        results = set()
        # 首字母匹配
        if word[0] in trie:
            results |= check_fuzzy(trie[word[0]], word[1:], path + word[0], tol)
        # 分情形继续搜索（相当于保留待探索的回溯分支）
        for k in trie:
            if k is not None and k != word[0]:
                # 用可能正确的字母置换首字母
                results |= check_fuzzy(trie[k], word[1:], path+k, tol-1)
                # 插入可能正确的字母作为首字母
                results |= check_fuzzy(trie[k], word, path+k, tol-1)
        # 跳过余词首字母
        results |= check_fuzzy(trie, word[1:], path, tol-1)
        # 交换原词头两个字母
        if len(word) > 1:
            results |= check_fuzzy(trie, word[1]+word[0]+word[2:], path, tol-1)
        return results

进一步，可以用yield减少不需要的计算，只在有需要的时候继续搜索。

def check_lazy(trie, word, path='', tol=1):
    if tol < 0:
        pass
    elif word == '':
        if None in trie:
            yield path
        # 增加词尾字母
        for k in trie:
            if k is not None:
                yield from check_lazy(trie[k], '', path + k, tol - 1)
    else:
        if word[0] in trie:
            # 首字母匹配成功
            yield from check_lazy(trie[word[0]], word[1:], path+word[0], tol)
        # 分情形继续搜索（相当于保留待探索的回溯分支）
        for k in trie:
            if k is not None and k != word[0]:
                # 用可能正确的字母置换首字母
                yield from check_lazy(trie[k], word[1:], path+k, tol-1)
                # 插入可能正确的字母作为首字母
                yield from check_lazy(trie[k], word, path+k, tol-1)
        # 跳过余词首字母
        yield from check_lazy(trie, word[1:], path, tol-1)
        # 交换原词头两个字母
        if len(word) > 1:
            yield from check_lazy(trie, word[1]+word[0]+word[2:], path, tol-1)

用迭代替换递归：

from collections import deque

def check_iter(trie, word, tol=1):
    seen = set()
    q = deque([(trie, word, '', tol)])
    while q:
        trie, word, path, tol = q.popleft()
        if word == '':
            if None in trie:
                if path not in seen:
                    seen.add(path)
                    yield path
            if tol > 0:
                for k in trie:
                    if k is not None:
                        q.appendleft((trie[k], '', path+k, tol-1))
        else:
            if word[0] in trie:
                q.appendleft((trie[word[0]], word[1:], path+word[0], tol))
            if tol > 0:
                for k in trie.keys():
                    if k is not None and k != word[0]:
                        q.append((trie[k], word[1:], path+k, tol-1))
                        q.append((trie[k], word, path+k, tol-1))
                q.append((trie, word[1:], path, tol-1))
                if len(word) > 1:
                    q.append((trie, word[1]+word[0]+word[2:], path, tol-1))

【可选的一步】纠正的时候往往倾向于认为首字母是无误的，利用这个现象可以减轻不少搜索压力，花费的时间可以少数倍。

def check_head_fixed(trie, word, tol=1):
    for p in check_lazy(trie[word[0]], word[1:], tol=tol):
        yield word[0] + p

另一个是来自How to Write a Spelling Corrector：

思路是对一次编辑距离的候选词和二次编辑距离的候选词进行搜索

import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

使用方法：

>>> correction('speling')
'spelling'

>>> correction('korrectud')
'corrected'