tokeniser.py 3 KB
 Alexander Young committed Mar 05, 2017 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 ``````import re class NainokamiTokeniser: def __init__(self, snippet): self._lines = snippet.splitlines() def process(self): grams = [] for x in self._lines: tokens = self._tokenise(x) grams += tokens grams += self._two_grams(tokens) grams += self._three_grams(tokens) grams += self._four_grams(tokens) return grams def _two_grams(self, tokens): two_grams = [] length_tokens = len(tokens) for x in range(0, length_tokens - 1): two_grams.append(tokens[x] + "," + tokens[x+1]) return two_grams def _three_grams(self,tokens): three_grams = [] length_tokens = len(tokens) if len(tokens) <= 3: for x in range (0, length_tokens - 2): three_grams.append(tokens[x] + "," + tokens[x+1] + "," + tokens[x+2]) return three_grams else: return [] def _four_grams(self,tokens): four_grams = [] length_tokens = len(tokens) if len(tokens) <= 3: for x in range (0, length_tokens - 3): four_grams.append(tokens[x] + "," + tokens[x+1] + "," + tokens[x+2] + "," + tokens[x+3]) return four_grams else: return [] def _tokenise(self, string): tokens = [] pos = 0 end_parens = re.compile("(.*?[)])") end_brackets = re.compile("(.*?[}])") end_square = re.compile("(.*?[]])") words = re.compile("\w+") punctuation = re.compile("[+*-=&^%#!?,.:;]") while pos < len(string): if words.match(string[pos:]): result = words.match(string[pos:]).end() tokens.append(string[pos:pos + result]) pos += result elif punctuation.match(string[pos:]): tokens.append(string[pos:pos + 1]) pos += 1 elif string[pos] == "(": tokens.append("(") result = end_parens.match(string[pos:]) if result: pos += result.end() - 1 else: pos += 1 elif string[pos] == ")": tokens.append(")") pos += 1 elif string[pos] == "{": tokens.append("{") result = end_brackets.match(string[pos:]) if result: pos += result.end() - 1 else: pos += 1 elif string[pos] == "}": tokens.append("}") pos += 1 elif string[pos] == "[": tokens.append("[") result = end_square.match(string[pos:]) if result: pos += result.end() - 1 else: pos += 1 elif string[pos] == "]": tokens.append("]") pos += 1 else: pos += 1 return tokens``````