Add N-gram generator

commit: 178e6dce01ad28c8708bad62ce0fb79c46e836dc [log] [tgz]
author: Behdad Esfahbod <[email protected]> Wed May 09 08:57:29 2012 +0200
committer: Behdad Esfahbod <[email protected]> Wed May 09 08:57:29 2012 +0200
tree: 4a8de5bbaa2c7febccf84cfcacddbe21c08914f6
parent: 98669ceb77657d60435f2cb2e3fc18272c0a2c6a [diff] [blame]
diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index d3c0939..a38f067 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py

@@ -169,6 +169,53 @@
 		total = passed + failed
 		print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
 
+	@staticmethod
+	def print_ngrams (f, ns=(1,2,3)):
+		gens = tuple (Ngram.generator (n) for n in ns)
+		for key, lines in DiffHelpers.separate_test_cases (f):
+			test = Test (lines)
+			unicodes = test.unicodes
+			del test
+
+			for gen in gens:
+				print "Printing %d-grams:" % gen.n
+				for ngram in gen (unicodes):
+					print ngram
+
+
+
+class Test:
+
+	def __init__ (self, lines):
+		self.passed = True
+		self.identifier = None
+		self.text = None
+		self.unicodes = None
+		self.glyphs = None
+		for l in lines:
+			symbol = l[0]
+			if symbol != ' ':
+				self.passed = False
+			i = 1
+			if ':' in l:
+				i = l.index (':')
+				if not self.identifier:
+					self.identifier = l[1:i]
+				i = i + 2 # Skip colon and space
+			j = -1
+			if l[j] == '\n':
+				j -= 1
+			brackets = l[i] + l[j]
+			l = l[i+1:-2]
+			if brackets == '()':
+				self.text = l
+			elif brackets == '<>':
+				self.unicodes = Unicode.parse (l)
+			elif brackets == '[]':
+				# XXX we don't handle failed tests here
+				self.glyphs = l
+
+
 class DiffHelpers:
 
 	@staticmethod
@@ -205,6 +252,23 @@
 		return printer
 
 
+class Ngram:
+
+	@staticmethod
+	def generator (n):
+
+		def gen (f):
+			l = []
+			for x in f:
+				l.append (x)
+				if len (l) == n:
+					yield tuple (l)
+					l[:1] = []
+
+		gen.n = n
+		return gen
+
+
 class UtilMains:
 
 	@staticmethod
@@ -276,10 +340,14 @@
 		return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
 
 	@staticmethod
-	def encode (s):
+	def parse (s):
 		s = re.sub (r"[<+>,\\uU\n	]", " ", s)
 		s = re.sub (r"0[xX]", " ", s)
-		return u''.join (unichr (int (x, 16)) for x in s.split (' ') if len (x)).encode ('utf-8')
+		return [int (x, 16) for x in s.split (' ') if len (x)]
+
+	@staticmethod
+	def encode (s):
+		return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
 
 	shorthands = {
 		"ZERO WIDTH NON-JOINER": "ZWNJ",
commit	178e6dce01ad28c8708bad62ce0fb79c46e836dc	[log] [tgz]
author	Behdad Esfahbod <[email protected]>	Wed May 09 08:57:29 2012 +0200
committer	Behdad Esfahbod <[email protected]>	Wed May 09 08:57:29 2012 +0200
tree	4a8de5bbaa2c7febccf84cfcacddbe21c08914f6
parent	98669ceb77657d60435f2cb2e3fc18272c0a2c6a [diff] [blame]