#!/usr/bin/env python
# -*- coding: utf-8 -*-
# langconv.py: converts text between the registered 'zh-hans' and 'zh-hant'
# mappings using a small state machine over the zh_wiki tables.
  3. from copy import deepcopy
  4. import re
  5. try:
  6. import psyco
  7. psyco.full()
  8. except:
  9. pass
  10. try:
  11. from zh_wiki import zh2Hant, zh2Hans
  12. except ImportError:
  13. from zhtools.zh_wiki import zh2Hant, zh2Hans
  14. import sys
  15. py3k = sys.version_info >= (3, 0, 0)
  16. if py3k:
  17. UEMPTY = ''
  18. else:
  19. _zh2Hant, _zh2Hans = {}, {}
  20. for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
  21. for k, v in old.items():
  22. new[k.decode('utf8')] = v.decode('utf8')
  23. zh2Hant = _zh2Hant
  24. zh2Hans = _zh2Hans
  25. UEMPTY = ''.decode('utf8')
  26. # states
  27. (START, END, FAIL, WAIT_TAIL) = list(range(4))
  28. # conditions
  29. (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))
  30. MAPS = {}
  31. class Node(object):
  32. def __init__(self, from_word, to_word=None, is_tail=True,
  33. have_child=False):
  34. self.from_word = from_word
  35. if to_word is None:
  36. self.to_word = from_word
  37. self.data = (is_tail, have_child, from_word)
  38. self.is_original = True
  39. else:
  40. self.to_word = to_word or from_word
  41. self.data = (is_tail, have_child, to_word)
  42. self.is_original = False
  43. self.is_tail = is_tail
  44. self.have_child = have_child
  45. def is_original_long_word(self):
  46. return self.is_original and len(self.from_word)>1
  47. def is_follow(self, chars):
  48. return chars != self.from_word[:-1]
  49. def __str__(self):
  50. return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
  51. repr(self.to_word), self.is_tail, self.have_child)
  52. __repr__ = __str__
  53. class ConvertMap(object):
  54. def __init__(self, name, mapping=None):
  55. self.name = name
  56. self._map = {}
  57. if mapping:
  58. self.set_convert_map(mapping)
  59. def set_convert_map(self, mapping):
  60. convert_map = {}
  61. have_child = {}
  62. max_key_length = 0
  63. for key in sorted(mapping.keys()):
  64. if len(key)>1:
  65. for i in range(1, len(key)):
  66. parent_key = key[:i]
  67. have_child[parent_key] = True
  68. have_child[key] = False
  69. max_key_length = max(max_key_length, len(key))
  70. for key in sorted(have_child.keys()):
  71. convert_map[key] = (key in mapping, have_child[key],
  72. mapping.get(key, UEMPTY))
  73. self._map = convert_map
  74. self.max_key_length = max_key_length
  75. def __getitem__(self, k):
  76. try:
  77. is_tail, have_child, to_word = self._map[k]
  78. return Node(k, to_word, is_tail, have_child)
  79. except:
  80. return Node(k)
  81. def __contains__(self, k):
  82. return k in self._map
  83. def __len__(self):
  84. return len(self._map)
  85. class StatesMachineException(Exception): pass
  86. class StatesMachine(object):
  87. def __init__(self):
  88. self.state = START
  89. self.final = UEMPTY
  90. self.len = 0
  91. self.pool = UEMPTY
  92. def clone(self, pool):
  93. new = deepcopy(self)
  94. new.state = WAIT_TAIL
  95. new.pool = pool
  96. return new
  97. def feed(self, char, map):
  98. node = map[self.pool+char]
  99. if node.have_child:
  100. if node.is_tail:
  101. if node.is_original:
  102. cond = UNMATCHED_SWITCH
  103. else:
  104. cond = MATCHED_SWITCH
  105. else:
  106. cond = CONNECTOR
  107. else:
  108. if node.is_tail:
  109. cond = TAIL
  110. else:
  111. cond = ERROR
  112. new = None
  113. if cond == ERROR:
  114. self.state = FAIL
  115. elif cond == TAIL:
  116. if self.state == WAIT_TAIL and node.is_original_long_word():
  117. self.state = FAIL
  118. else:
  119. self.final += node.to_word
  120. self.len += 1
  121. self.pool = UEMPTY
  122. self.state = END
  123. elif self.state == START or self.state == WAIT_TAIL:
  124. if cond == MATCHED_SWITCH:
  125. new = self.clone(node.from_word)
  126. self.final += node.to_word
  127. self.len += 1
  128. self.state = END
  129. self.pool = UEMPTY
  130. elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
  131. if self.state == START:
  132. new = self.clone(node.from_word)
  133. self.final += node.to_word
  134. self.len += 1
  135. self.state = END
  136. else:
  137. if node.is_follow(self.pool):
  138. self.state = FAIL
  139. else:
  140. self.pool = node.from_word
  141. elif self.state == END:
  142. # END is a new START
  143. self.state = START
  144. new = self.feed(char, map)
  145. elif self.state == FAIL:
  146. raise StatesMachineException('Translate States Machine '
  147. 'have error with input data %s' % node)
  148. return new
  149. def __len__(self):
  150. return self.len + 1
  151. def __str__(self):
  152. return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
  153. id(self), self.pool, self.state, self.final)
  154. __repr__ = __str__
  155. class Converter(object):
  156. def __init__(self, to_encoding):
  157. self.to_encoding = to_encoding
  158. self.map = MAPS[to_encoding]
  159. self.start()
  160. def feed(self, char):
  161. branches = []
  162. for fsm in self.machines:
  163. new = fsm.feed(char, self.map)
  164. if new:
  165. branches.append(new)
  166. if branches:
  167. self.machines.extend(branches)
  168. self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
  169. all_ok = True
  170. for fsm in self.machines:
  171. if fsm.state != END:
  172. all_ok = False
  173. if all_ok:
  174. self._clean()
  175. return self.get_result()
  176. def _clean(self):
  177. if len(self.machines):
  178. self.machines.sort(key=lambda x: len(x))
  179. # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
  180. self.final += self.machines[0].final
  181. self.machines = [StatesMachine()]
  182. def start(self):
  183. self.machines = [StatesMachine()]
  184. self.final = UEMPTY
  185. def end(self):
  186. self.machines = [fsm for fsm in self.machines
  187. if fsm.state == FAIL or fsm.state == END]
  188. self._clean()
  189. def convert(self, string):
  190. self.start()
  191. for char in string:
  192. self.feed(char)
  193. self.end()
  194. return self.get_result()
  195. def get_result(self):
  196. return self.final
  197. def registery(name, mapping):
  198. global MAPS
  199. MAPS[name] = ConvertMap(name, mapping)
  200. registery('zh-hant', zh2Hant)
  201. registery('zh-hans', zh2Hans)
  202. del zh2Hant, zh2Hans
  203. def run():
  204. import sys
  205. from optparse import OptionParser
  206. parser = OptionParser()
  207. parser.add_option('-e', type='string', dest='encoding',
  208. help='encoding')
  209. parser.add_option('-f', type='string', dest='file_in',
  210. help='input file (- for stdin)')
  211. parser.add_option('-t', type='string', dest='file_out',
  212. help='output file')
  213. (options, args) = parser.parse_args()
  214. if not options.encoding:
  215. parser.error('encoding must be set')
  216. if options.file_in:
  217. if options.file_in == '-':
  218. file_in = sys.stdin
  219. else:
  220. file_in = open(options.file_in)
  221. else:
  222. file_in = sys.stdin
  223. if options.file_out:
  224. if options.file_out == '-':
  225. file_out = sys.stdout
  226. else:
  227. file_out = open(options.file_out, 'wb')
  228. else:
  229. file_out = sys.stdout
  230. c = Converter(options.encoding)
  231. for line in file_in:
  232. # print >> file_out, c.convert(line.rstrip('\n').decode(
  233. file_out.write(c.convert(line.rstrip('\n').decode(
  234. 'utf8')).encode('utf8'))
  235. if __name__ == '__main__':
  236. run()