正向最大匹配
# -*- coding:utf-8 -*-
# Encoding name constant. The code visible in this script passes the literal
# 'utf-8' directly instead of referencing this name.
CODEC = 'utf-8'
def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Codec name used to decode *s* when it is not already text.

    Returns:
        *s* itself when it is already text, otherwise the decoded text.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text type
    # and ``str(bytes, encoding)`` performs the same decoding.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
def fwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* using forward maximum matching.

    Scanning from left to right, the longest prefix of the remaining text
    (at most *maxLen* characters) that is contained in *wordDict* is emitted
    as one segment. When no candidate of two or more characters matches, a
    single character is emitted, whether or not it is in *wordDict*.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate length in characters; must be at least 1.
        str: Text to segment. The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        List of segments in text order; joined together they equal *str*.

    Raises:
        ValueError: If *maxLen* is less than 1 (no progress could be made).
    """
    if maxLen < 1:
        raise ValueError('maxLen must be at least 1')

    text = str
    total = len(text)
    wordList = []
    start = 0
    while start < total:
        wordLen = min(maxLen, total - start)
        # Shrink the candidate from the right until it is a known word;
        # a one-character candidate is always accepted.
        while wordLen > 1 and text[start:start + wordLen] not in wordDict:
            wordLen -= 1
        wordList.append(text[start:start + wordLen])
        start += wordLen
    return wordList
def main():
    """Segment a sample sentence with ``fwd_mm_seg`` and print the result.

    Reads one dictionary word per line from ``words.dic`` (opened relative to
    the current working directory, decoded as UTF-8), prints the sample text,
    then prints the segments joined with ``==``.
    """
    # Local import: io.open decodes to text on both Python 2 and Python 3.
    import io

    # The context manager closes the dictionary file even if reading fails.
    with io.open('words.dic', encoding='utf-8') as fp_dict:
        stripped = (eachWord.strip() for eachWord in fp_dict)
        # Blank lines carry no word; the matcher only needs membership tests.
        wordDict = set(word for word in stripped if word)

    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print('=='.join(wordList))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Codec name used to decode *s* when it is not already text.

    Returns:
        *s* itself when it is already text, otherwise the decoded text.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text type
    # and ``str(bytes, encoding)`` performs the same decoding.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
# Encoding name constant. The code visible in this script passes the literal
# 'utf-8' directly instead of referencing this name.
CODEC = 'utf-8'
def bwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* using backward (reverse) maximum matching.

    Scanning from right to left, the longest suffix of the remaining text
    (at most *maxLen* characters) that is contained in *wordDict* is emitted
    as one segment. When no candidate of two or more characters matches, a
    single character is emitted, whether or not it is in *wordDict*.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate length in characters; must be at least 1.
        str: Text to segment. The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        List of segments in text order; joined together they equal *str*.

    Raises:
        ValueError: If *maxLen* is less than 1 (no progress could be made).
    """
    if maxLen < 1:
        raise ValueError('maxLen must be at least 1')

    text = str
    end = len(text)
    wordList = []
    while end > 0:
        wordLen = min(maxLen, end)
        # Shrink the candidate from the left until it is a known word;
        # a one-character candidate is always accepted.
        while wordLen > 1 and text[end - wordLen:end] not in wordDict:
            wordLen -= 1
        wordList.append(text[end - wordLen:end])
        end -= wordLen
    # Segments were collected right to left; restore reading order.
    wordList.reverse()
    return wordList
def main():
    """Segment a sample sentence with ``bwd_mm_seg`` and print the result.

    Reads one dictionary word per line from ``words.dic`` (opened relative to
    the current working directory, decoded as UTF-8), prints the sample text,
    then prints the segments joined with ``==``.
    """
    # Local import: io.open decodes to text on both Python 2 and Python 3.
    import io

    # The context manager closes the dictionary file even if reading fails.
    with io.open('words.dic', encoding='utf-8') as fp_dict:
        stripped = (eachWord.strip() for eachWord in fp_dict)
        # Blank lines carry no word; the matcher only needs membership tests.
        wordDict = set(word for word in stripped if word)

    # Plain ``u`` prefix: the text has no backslashes, so the value is the
    # same as with ``ur``, which Python 3 rejects.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print('=='.join(wordList))


if __name__ == '__main__':
    main()
以上这篇python正向最大匹配分词和逆向最大匹配分词的实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持脚本之家。
继续阅读与本文标签相同的文章
-
谷歌AI模型ML-Jam激发音乐家创作,人机合作或成未来大势!
2026-05-19栏目: 教程
-
OCP-052考试题库汇总(49)-CUUG内部解答版
2026-05-19栏目: 教程
-
自己实现 ECS 购买页需要用到的 API
2026-05-19栏目: 教程
-
Istio从懵圈到熟练 – 二分之一活的微服务
2026-05-19栏目: 教程
-
又一个里程碑,谷歌发布最新研究,机器学习开辟新篇章!
2026-05-19栏目: 教程
