# tokeniser.py
import re

class NainokamiTokeniser:
    """Tokenise a code snippet line by line and emit, per line, the tokens
    themselves plus their comma-joined 2-, 3- and 4-grams.

    N-grams never span lines: each line of the snippet is tokenised and
    n-grammed independently.
    """

    def __init__(self, snippet):
        # snippet: the raw text to tokenise; split eagerly into lines.
        self._lines = snippet.splitlines()

    def process(self):
        """Return a flat list: for each line, its tokens followed by the
        line's 2-grams, 3-grams and 4-grams (in that order)."""
        grams = []
        for line in self._lines:
            tokens = self._tokenise(line)
            grams += tokens
            grams += self._two_grams(tokens)
            grams += self._three_grams(tokens)
            grams += self._four_grams(tokens)
        return grams

    def _two_grams(self, tokens):
        """Return every adjacent token pair, joined with commas."""
        return [",".join(tokens[i:i + 2]) for i in range(len(tokens) - 1)]

    def _three_grams(self, tokens):
        """Return every adjacent token triple, joined with commas.

        BUG FIX: the original guard was ``if len(tokens) <= 3`` (returning
        ``[]`` otherwise), which silently discarded all 3-grams for any line
        with more than three tokens.  No guard is needed: for fewer than
        three tokens the range below is already empty.
        """
        return [",".join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]

    def _four_grams(self, tokens):
        """Return every adjacent token quadruple, joined with commas.

        BUG FIX: the original guard was ``if len(tokens) <= 3``, under which
        the loop range ``range(0, len - 3)`` was always empty — the method
        could never produce a single 4-gram.  As above, no guard is needed.
        """
        return [",".join(tokens[i:i + 4]) for i in range(len(tokens) - 3)]

    def _tokenise(self, string):
        """Split one line into tokens.

        Runs of word characters (``\\w+``) and single punctuation characters
        become tokens.  For an opening ``(``, ``{`` or ``[`` the opener is
        emitted and the scan jumps ahead to the first matching-style closer
        on the same line, so the bracketed content itself is skipped.
        NOTE(review): that skip is the original behaviour, preserved here —
        presumably deliberate, but worth confirming with the author.
        Whitespace and any unrecognised character are silently skipped.
        """
        # Raw strings so "\w" is not an invalid-escape warning on modern
        # Pythons.
        words = re.compile(r"\w+")
        # BUG FIX: the original class "[+*-=&^%#!?,.:;]" contained the
        # unescaped range *-= (codepoints '*'..'='), which accidentally
        # matched '/', '<' and other unlisted characters.  The hyphen is
        # escaped so only the listed characters count as punctuation.
        punctuation = re.compile(r"[+*\-=&^%#!?,.:;]")
        # Lazy match up to (and including) the first closer of each kind.
        closers = {
            "(": re.compile(r"(.*?[)])"),
            "{": re.compile(r"(.*?[}])"),
            "[": re.compile(r"(.*?[]])"),
        }
        tokens = []
        pos = 0
        length = len(string)
        while pos < length:
            word = words.match(string, pos)  # match once, reuse (original matched twice)
            if word:
                tokens.append(word.group())
                pos = word.end()
            elif punctuation.match(string, pos):
                tokens.append(string[pos])
                pos += 1
            elif string[pos] in "({[":
                tokens.append(string[pos])
                skip = closers[string[pos]].match(string, pos)
                if skip:
                    # Jump to the closing character itself; it is emitted
                    # as its own token on the next iteration.
                    pos = skip.end() - 1
                else:
                    # No closer on this line: just step past the opener.
                    pos += 1
            elif string[pos] in ")}]":
                tokens.append(string[pos])
                pos += 1
            else:
                # Whitespace / unrecognised character: skip it.
                pos += 1
        return tokens