Commit 62e27000 authored by Alexander Young

Initial commit


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
*.swp
# Smoke test: classify a Java source snippet with BayesCalc and print the per-language scores.
from nainokami.bayes_calc import BayesCalc
import cProfile

r = BayesCalc("""
package org.elasticsearch.tasks;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.logging.log4j.util.Supplier;
import org.elasticsearch.common.logging.Loggers;
/**
* A TaskListener that just logs the response at the info level. Used when we
* need a listener but aren't returning the result to the user.
*/
public final class LoggingTaskListener<Response> implements TaskListener<Response> {
private static final Logger logger = Loggers.getLogger(LoggingTaskListener.class);
/**
* Get the instance of NoopActionListener cast appropriately.
*/
@SuppressWarnings("unchecked") // Safe because we only toString the response
public static <Response> TaskListener<Response> instance() {
return (TaskListener<Response>) INSTANCE;
}
private static final LoggingTaskListener<Object> INSTANCE = new LoggingTaskListener<Object>();
private LoggingTaskListener() {
}
@Override
public void onResponse(Task task, Response response) {
logger.info("{} finished with response {}", task.getId(), response);
}
@Override
public void onFailure(Task task, Throwable e) {
logger.warn((Supplier<?>) () -> new ParameterizedMessage("{} failed with exception", task.getId()), e);
}
}
""")
print(": " + str(r.generate_probabilities()))
# nainokami/bayes_calc.py: naive Bayes scorer. Looks up the snippet's n-grams
# in the bayesian_training database and scores each known language.
from .tokeniser import NainokamiTokeniser
import mysql.connector
from functools import reduce
import math
from multiprocessing import Pool, Manager, cpu_count


class BayesCalc:
    def __init__(self, snippet):
        self._tokeniser = NainokamiTokeniser(snippet)
        self._grams = self._tokeniser.process()
        self._connection = mysql.connector.connect(user='root',
                                                   password='abc123',
                                                   database='bayesian_training')
        self._cursor = self._connection.cursor()
        self._languages = self._lookup_languages()
        self._grams_found = self._lookup_grams()
        self._sum_of_occurences = 0

    def _lookup_grams(self):
        # Map each extracted n-gram to its id, skipping grams never seen in training.
        gram_ids = []
        for gram in self._grams:
            self._cursor.execute("SELECT id FROM grams WHERE gram = %s", (gram,))
            gram_id = self._cursor.fetchone()
            if gram_id is not None:
                gram_ids.append(gram_id[0])
        return gram_ids

    def generate_probabilities(self):
        # One (language_id, language_name, log probability) tuple per language.
        probabilities = []
        for lang in self._languages:
            probability = self._calculate_language_probability(lang)
            probabilities.append(probability)
        return probabilities

    def _lookup_languages(self):
        self._cursor.execute("SELECT * FROM languages")
        return self._cursor.fetchall()

    def _calculate_language_probability(self, language_tuple):
        lang_id = language_tuple[0]
        lang_name = language_tuple[1]
        # Total number of gram occurrences recorded for this language.
        self._cursor.execute("SELECT SUM(number) FROM occurences WHERE language_id = %s", (lang_id,))
        lang_denominator = self._cursor.fetchone()[0]
        # Sum of log P(gram | language) over the grams found in the snippet.
        # Normalising by P(gram) is disabled; since that term is the same for
        # every language, the unnormalised scores still rank languages correctly.
        numerator_args = [self._probability_of_gram_given_lang(x, lang_id, lang_denominator)
                          for x in self._grams_found]
        probability = sum(numerator_args)
        return (lang_id, lang_name, probability)

    def _probability_of_gram_given_lang(self, gram_id, lang_id, denominator):
        self._cursor.execute("SELECT SUM(number) FROM occurences WHERE gram_id = %s AND language_id = %s",
                             (gram_id, lang_id))
        numerator = self._cursor.fetchone()[0]
        if numerator is None:
            # Gram never seen with this language: fall back to a count of 1 as
            # crude smoothing so the log is defined.
            numerator = 1
        return math.log(float(numerator) / float(denominator))

    def _probability_of_gram(self, gram_id):
        # P(gram) across all languages; kept for the disabled normalisation step.
        self._cursor.execute("SELECT SUM(number) FROM occurences WHERE gram_id = %s", (gram_id,))
        numerator = self._cursor.fetchone()[0]
        if self._sum_of_occurences == 0:
            self._cursor.execute("SELECT SUM(number) FROM occurences")
            self._sum_of_occurences = self._cursor.fetchone()[0]
        denominator = self._sum_of_occurences
        return math.log(float(numerator) / float(denominator))
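
The queries above imply a small training schema: languages(id, name), grams(id, gram) and occurences(gram_id, language_id, number), all in the bayesian_training database. A minimal, hedged setup sketch; the table and column names come from the SELECT statements, while the column types and key choices are assumptions:

import mysql.connector

# Hypothetical one-off setup for the bayesian_training database used above.
# The table name 'occurences' keeps the spelling used by the code.
conn = mysql.connector.connect(user='root', password='abc123')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS bayesian_training")
cur.execute("USE bayesian_training")
cur.execute("CREATE TABLE IF NOT EXISTS languages ("
            "id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(64))")
cur.execute("CREATE TABLE IF NOT EXISTS grams ("
            "id INT AUTO_INCREMENT PRIMARY KEY, gram VARCHAR(255))")
cur.execute("CREATE TABLE IF NOT EXISTS occurences ("
            "gram_id INT, language_id INT, number INT)")
conn.commit()
conn.close()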
# nainokami/tokeniser.py: splits a snippet into word tokens and builds
# 1- to 4-gram features for the Bayes scorer.
import re


class NainokamiTokeniser:
    def __init__(self, snippet):
        self._lines = snippet.splitlines()

    def process(self):
        # Tokenise line by line, then add the 2-, 3- and 4-grams built from
        # each line's tokens.
        grams = []
        for x in self._lines:
            tokens = self._tokenise(x)
            grams += tokens
            grams += self._two_grams(tokens)
            grams += self._three_grams(tokens)
            grams += self._four_grams(tokens)
        return grams

    def _two_grams(self, tokens):
        two_grams = []
        length_tokens = len(tokens)
        for x in range(0, length_tokens - 1):
            two_grams.append(tokens[x] + "," + tokens[x + 1])
        return two_grams

    def _three_grams(self, tokens):
        # range() is empty when there are fewer than three tokens, so no
        # length guard is needed.
        three_grams = []
        length_tokens = len(tokens)
        for x in range(0, length_tokens - 2):
            three_grams.append(tokens[x] + "," + tokens[x + 1] + "," + tokens[x + 2])
        return three_grams

    def _four_grams(self, tokens):
        four_grams = []
        length_tokens = len(tokens)
        for x in range(0, length_tokens - 3):
            four_grams.append(tokens[x] + "," + tokens[x + 1] + "," + tokens[x + 2] + "," + tokens[x + 3])
        return four_grams
    def _tokenise(self, string):
        tokens = []
        pos = 0
        end_parens = re.compile(r"(.*?[)])")
        end_brackets = re.compile(r"(.*?[}])")
        end_square = re.compile(r"(.*?[]])")
        words = re.compile(r"\w+")
        # Hyphen escaped so the class matches the listed punctuation literally
        # rather than a character range from '*' to '='.
        punctuation = re.compile(r"[+*\-=&^%#!?,.:;]")
        while pos < len(string):
            if words.match(string[pos:]):
                result = words.match(string[pos:]).end()
                tokens.append(string[pos:pos + result])
                pos += result