#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Converter between Chinese variants, driven by the zh2Hant / zh2Hans
# mapping tables imported below.
from copy import deepcopy
import re

# Optional accelerator: enable psyco when it happens to be installed.
# Any ordinary failure (module missing, psyco refusing to start) is ignored on
# purpose, but KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    import psyco
    psyco.full()
except Exception:
    pass

# The mapping tables live either next to this module or inside the
# ``zhtools`` package, depending on how the code is installed.
try:
    from zh_wiki import zh2Hant, zh2Hans
except ImportError:
    from zhtools.zh_wiki import zh2Hant, zh2Hans

import sys
# True when running under Python 3 or later.
py3k = sys.version_info >= (3, 0, 0)

if py3k:
    # Python 3 string literals are already text; this is the empty text value
    # used as the initial/neutral element for string accumulation below.
    UEMPTY = ''
else:
    # Python 2: the imported tables are decoded from UTF-8 here, key by key
    # and value by value, into fresh dicts holding unicode objects.
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    # Rebind the public names to the decoded tables.
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    # Empty unicode string, the Python 2 counterpart of '' above.
    UEMPTY = ''.decode('utf8')
# States a StatesMachine can be in.
START, END, FAIL, WAIT_TAIL = range(4)

# Conditions derived from a looked-up Node while feeding a character.
TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR = range(5)

# Registry of ConvertMap objects, keyed by the name passed to registery().
MAPS = {}
class Node(object):
    """Result of looking up one candidate word in a conversion table.

    Attributes:
        from_word: the text that was looked up.
        to_word: the replacement text; equals ``from_word`` when no
            replacement was supplied or the supplied one is empty.
        is_tail: flag passed by the caller (defaults to True).
        have_child: flag passed by the caller (defaults to False).
        is_original: True when the node was built without a ``to_word``.
        data: tuple ``(is_tail, have_child, word)`` where ``word`` is the
            ``to_word`` argument as given, or ``from_word`` if it was None.
    """

    def __init__(self, from_word, to_word=None, is_tail=True,
                 have_child=False):
        self.from_word = from_word
        self.is_tail = is_tail
        self.have_child = have_child
        self.is_original = to_word is None
        if self.is_original:
            # No replacement given: the word maps to itself.
            self.to_word = from_word
            self.data = (is_tail, have_child, from_word)
        else:
            # An empty replacement falls back to the source word for output,
            # while ``data`` keeps the raw argument.
            self.to_word = to_word or from_word
            self.data = (is_tail, have_child, to_word)

    def is_original_long_word(self):
        """Tell whether this is an unconverted word longer than one char."""
        return self.is_original and len(self.from_word) > 1

    def is_follow(self, chars):
        """Return True when *chars* differs from ``from_word`` minus its last
        character.

        Note the name reads inverted: a True result means *chars* is NOT the
        leading part of this word.
        """
        leading_part = self.from_word[:-1]
        return chars != leading_part

    def __str__(self):
        fields = (repr(self.from_word), repr(self.to_word),
                  self.is_tail, self.have_child)
        return '<Node, %s, %s, %s, %s>' % fields

    __repr__ = __str__
class ConvertMap(object):
    """Lookup table over a ``{source: target}`` mapping and its key prefixes.

    Internally every mapping key, and every proper prefix of every key, is
    stored as ``key -> (is_tail, have_child, to_word)`` where:

    * ``is_tail`` is True when the key itself is present in the mapping,
    * ``have_child`` is True when some longer mapping key starts with it,
    * ``to_word`` is the mapped value, or ``UEMPTY`` for prefix-only entries.

    Attributes:
        name: label given at construction time.
        max_key_length: length of the longest mapping key (0 when empty).
    """

    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        # Set up front so a map built without a mapping still has the
        # attribute instead of raising AttributeError on access.
        self.max_key_length = 0
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        """Replace the table contents with entries derived from *mapping*."""
        have_child = {}
        max_key_length = 0
        for key in mapping:
            # Every proper prefix of a key has at least this key as a child.
            for end in range(1, len(key)):
                have_child[key[:end]] = True
            # setdefault keeps a True set by a longer key seen earlier, so
            # the result does not depend on iteration order (no sort needed).
            have_child.setdefault(key, False)
            max_key_length = max(max_key_length, len(key))

        convert_map = {}
        for key, has_child in have_child.items():
            if key in mapping:
                convert_map[key] = (True, has_child, mapping[key])
            else:
                # Prefix that is not itself a word: no replacement text.
                convert_map[key] = (False, has_child, UEMPTY)
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        """Return a Node for *k*; unknown keys yield an original Node."""
        try:
            is_tail, have_child, to_word = self._map[k]
        except KeyError:
            # Only a missing key means "not in the table"; other errors
            # are real bugs and are allowed to propagate.
            return Node(k)
        return Node(k, to_word, is_tail, have_child)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)
class StatesMachineException(Exception):
    """Error raised by StatesMachine.feed for a machine already in FAIL."""
class StatesMachine(object):
    """One candidate conversion path over the characters fed so far.

    Attributes:
        state: one of START, END, FAIL, WAIT_TAIL.
        final: output text accumulated by this path.
        len: number of times a word has been appended to ``final``.
        pool: characters consumed but not yet resolved into output.
    """

    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        """Return a deep copy of this machine waiting for a tail on *pool*."""
        branch = deepcopy(self)
        branch.pool = pool
        branch.state = WAIT_TAIL
        return branch

    def feed(self, char, map):
        """Advance this machine by one character.

        *map* is indexed with ``pool + char`` and must return a Node-like
        object.  Returns a newly branched machine when the path forks,
        otherwise None.  Raises StatesMachineException when the machine is
        already in FAIL and the looked-up node is a switch or connector.
        """
        node = map[self.pool + char]

        # Classify the lookup result.
        if node.have_child:
            if not node.is_tail:
                cond = CONNECTOR
            elif node.is_original:
                cond = UNMATCHED_SWITCH
            else:
                cond = MATCHED_SWITCH
        elif node.is_tail:
            cond = TAIL
        else:
            cond = ERROR

        if cond == ERROR:
            self.state = FAIL
            return None

        if cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                # A pending multi-char word that is not in the table.
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
            return None

        if self.state in (START, WAIT_TAIL):
            branch = None
            if cond == MATCHED_SWITCH:
                # Fork: the copy keeps looking for a longer word while this
                # machine commits to the match found so far.
                branch = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond in (UNMATCHED_SWITCH, CONNECTOR):
                if self.state == START:
                    branch = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                elif node.is_follow(self.pool):
                    # is_follow() is True when the pool does NOT lead into
                    # this node's word.
                    self.state = FAIL
                else:
                    self.pool = node.from_word
            return branch

        if self.state == END:
            # END is a new START
            self.state = START
            return self.feed(char, map)

        if self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                    'have error with input data %s' % node)

        return None

    def __len__(self):
        # Always at least 1, so an instance never evaluates as false.
        return self.len + 1

    def __str__(self):
        details = (id(self), self.pool, self.state, self.final)
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % details

    __repr__ = __str__
class Converter(object):
    """Drive a set of StatesMachine paths over an input string.

    The conversion table is taken from ``MAPS[to_encoding]``; the converted
    text accumulates in ``self.final``.
    """

    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        """Feed one character to every live machine; return the text so far."""
        spawned = []
        for fsm in self.machines:
            child = fsm.feed(char, self.map)
            if child:
                spawned.append(child)
        self.machines.extend(spawned)

        # Discard paths that failed on this character.
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]

        # Once every surviving path is at a word boundary, commit one.
        if all(fsm.state == END for fsm in self.machines):
            self._clean()
        return self.get_result()

    def _clean(self):
        """Commit the path with the smallest len() and restart the machines."""
        if self.machines:
            # First machine with the smallest len(), i.e. fewest appended words.
            shortest = min(self.machines, key=len)
            self.final += shortest.final
        self.machines = [StatesMachine()]

    def start(self):
        """Reset to a single fresh machine and empty output."""
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        """Finish the input: keep only FAIL/END machines, then commit."""
        finished = (FAIL, END)
        self.machines = [fsm for fsm in self.machines
                         if fsm.state in finished]
        self._clean()

    def convert(self, string):
        """Convert *string* from scratch and return the result."""
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        """Return the text committed so far."""
        return self.final
def registery(name, mapping):
    """Wrap *mapping* in a ConvertMap and store it in MAPS under *name*."""
    # MAPS is only mutated, never rebound, so no ``global`` is required.
    MAPS[name] = ConvertMap(name, mapping)
# Register the two imported tables under the names accepted by Converter().
registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
# The tables have been copied into ConvertMap objects; drop the module-level
# names.
del zh2Hant, zh2Hans
def run():
    """Command-line entry point: convert UTF-8 text with a registered map.

    Options:
        -e  name of the conversion map to use (required)
        -f  input file; omitted or '-' reads standard input
        -t  output file; omitted or '-' writes standard output

    Each input line is decoded as UTF-8, converted, re-encoded as UTF-8 and
    written out followed by the line terminator it arrived with.  Exits via
    ``parser.error`` when -e is missing or names an unregistered map.
    """
    import sys
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
                      help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
                      help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
                      help='output file')
    options, _ = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')
    if options.encoding not in MAPS:
        # Report a usage error instead of a raw KeyError traceback.
        parser.error('unknown encoding: %s' % options.encoding)

    use_stdin = not options.file_in or options.file_in == '-'
    use_stdout = not options.file_out or options.file_out == '-'

    # All I/O is done on bytes so the explicit UTF-8 decode/encode below works
    # the same on Python 2 and 3.  Python 3 exposes the byte streams as
    # ``.buffer``; Python 2 standard streams are byte streams already.
    if use_stdin:
        file_in = getattr(sys.stdin, 'buffer', sys.stdin)
    else:
        file_in = open(options.file_in, 'rb')
    try:
        if use_stdout:
            file_out = getattr(sys.stdout, 'buffer', sys.stdout)
        else:
            file_out = open(options.file_out, 'wb')
        try:
            c = Converter(options.encoding)
            for line in file_in:
                # Convert only the text and re-attach the original line
                # terminator; writing the converted text alone would glue
                # all output lines together.
                text = line.rstrip(b'\r\n')
                terminator = line[len(text):]
                converted = c.convert(text.decode('utf8'))
                file_out.write(converted.encode('utf8') + terminator)
        finally:
            # Close only what this function opened.
            if not use_stdout:
                file_out.close()
    finally:
        if not use_stdin:
            file_in.close()
# Run the command-line converter only when executed as a script.
if __name__ == '__main__':
    run()