1
0
mirror of https://github.com/Jermolene/TiddlyWiki5 synced 2024-11-24 10:37:20 +00:00
TiddlyWiki5/node_modules/esprima/tools/generate-unicode-regex.py
2012-05-07 09:43:29 +01:00

165 lines
4.8 KiB
Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
# By Yusuke Suzuki <utatane.tea@gmail.com>
# Modified by Mathias Bynens <http://mathiasbynens.be/>
# http://code.google.com/p/esprima/issues/detail?id=110
import sys
import string
import re
class RegExpGenerator(object):
def __init__(self, detector):
self.detector = detector
def generate_identifier_start(self):
r = [ ch for ch in range(0xFFFF + 1) if self.detector.is_identifier_start(ch)]
return self._generate_range(r)
def generate_identifier_part(self):
r = [ ch for ch in range(0xFFFF + 1) if self.detector.is_identifier_part(ch)]
return self._generate_range(r)
def generate_non_ascii_identifier_start(self):
r = [ ch for ch in xrange(0x0080, 0xFFFF + 1) if self.detector.is_identifier_start(ch)]
return self._generate_range(r)
def generate_non_ascii_identifier_part(self):
r = [ ch for ch in range(0x0080, 0xFFFF + 1) if self.detector.is_identifier_part(ch)]
return self._generate_range(r)
def generate_non_ascii_separator_space(self):
r = [ ch for ch in range(0x0080, 0xFFFF + 1) if self.detector.is_separator_space(ch)]
return self._generate_range(r)
def _generate_range(self, r):
if len(r) == 0:
return '[]'
buf = []
start = r[0]
end = r[0]
predict = start + 1
r = r[1:]
for code in r:
if predict == code:
end = code
predict = code + 1
continue
else:
if start == end:
buf.append("\\u%04X" % start)
elif end == start + 1:
buf.append("\\u%04X\\u%04X" % (start, end))
else:
buf.append("\\u%04X-\\u%04X" % (start, end))
start = code
end = code
predict = code + 1
if start == end:
buf.append("\\u%04X" % start)
else:
buf.append("\\u%04X-\\u%04X" % (start, end))
return '[' + ''.join(buf) + ']'
class Detector(object):
def __init__(self, data):
self.data = data
def is_ascii(self, ch):
return ch < 0x80
def is_ascii_alpha(self, ch):
v = ch | 0x20
return v >= ord('a') and v <= ord('z')
def is_decimal_digit(self, ch):
return ch >= ord('0') and ch <= ord('9')
def is_octal_digit(self, ch):
return ch >= ord('0') and ch <= ord('7')
def is_hex_digit(self, ch):
v = ch | 0x20
return self.is_decimal_digit(c) or (v >= ord('a') and v <= ord('f'))
def is_digit(self, ch):
return self.is_decimal_digit(ch) or self.data[ch] == 'Nd'
def is_ascii_alphanumeric(self, ch):
return self.is_decimal_digit(ch) or self.is_ascii_alpha(ch)
def _is_non_ascii_identifier_start(self, ch):
c = self.data[ch]
return c == 'Lu' or c == 'Ll' or c == 'Lt' or c == 'Lm' or c == 'Lo' or c == 'Nl'
def _is_non_ascii_identifier_part(self, ch):
c = self.data[ch]
return c == 'Lu' or c == 'Ll' or c == 'Lt' or c == 'Lm' or c == 'Lo' or c == 'Nl' or c == 'Mn' or c == 'Mc' or c == 'Nd' or c == 'Pc' or ch == 0x200C or ch == 0x200D
def is_separator_space(self, ch):
return self.data[ch] == 'Zs'
def is_white_space(self, ch):
return ch == ord(' ') or ch == ord("\t") or ch == 0xB or ch == 0xC or ch == 0x00A0 or ch == 0xFEFF or self.is_separator_space(ch)
def is_line_terminator(self, ch):
return ch == 0x000D or ch == 0x000A or self.is_line_or_paragraph_terminator(ch)
def is_line_or_paragraph_terminator(self, ch):
return ch == 0x2028 or ch == 0x2029
def is_identifier_start(self, ch):
if self.is_ascii(ch):
return ch == ord('$') or ch == ord('_') or ch == ord('\\') or self.is_ascii_alpha(ch)
return self._is_non_ascii_identifier_start(ch)
def is_identifier_part(self, ch):
if self.is_ascii(ch):
return ch == ord('$') or ch == ord('_') or ch == ord('\\') or self.is_ascii_alphanumeric(ch)
return self._is_non_ascii_identifier_part(ch)
def analyze(source):
data = []
dictionary = {}
with open(source) as uni:
flag = False
first = 0
for line in uni:
d = string.split(line.strip(), ";")
val = int(d[0], 16)
if flag:
if re.compile("<.+, Last>").match(d[1]):
# print "%s : u%X" % (d[1], val)
flag = False
for t in range(first, val+1):
dictionary[t] = str(d[2])
else:
raise "Database Exception"
else:
if re.compile("<.+, First>").match(d[1]):
# print "%s : u%X" % (d[1], val)
flag = True
first = val
else:
dictionary[val] = str(d[2])
for i in range(0xFFFF + 1):
if dictionary.get(i) == None:
data.append("Un")
else:
data.append(dictionary[i])
return RegExpGenerator(Detector(data))
def main(source):
generator = analyze(source)
print generator.generate_non_ascii_identifier_start()
print generator.generate_non_ascii_identifier_part()
print generator.generate_non_ascii_separator_space()
if __name__ == '__main__':
main(sys.argv[1])