Commit ff6c761b authored by Roman Shishkin

Merge branch 'master' of https://gitlab.com/keksquad/echoes

parents e3c3ec99 befcd3f1
# coding: utf-8
import string

# Note: numpy's sum shadows the builtin sum in this module
from numpy import zeros, sum, asarray, log
from numpy.linalg import svd

from porter_stemmer import PorterStemmer


class LatentSemanticAnalyser:
    def __init__(self, stop_words, ignored_characters, documents):
        # All corpus words, mapped to the documents in which they occur
        self.known_words = {}
        # All indexed words
        self.dictionary = []
        # Noise words that are not included in the index
        self.stop_words = stop_words
        # Indices of indexed words that occur in the corpus; filled in by
        # build_document_term_matrix and defines the row order of the matrix
        self.keys = []
        # Initialize the normalization rules
        # TODO: Use a collection instead of a string
        self.ignored_characters = ignored_characters
        # Initialize the documents
        self.documents = []
        for document in documents:
            self.add_document(document)
    def prepare(self):
        # Guard against repeated preparation: find() and get_documents() both
        # call prepare(), and applying TF-IDF twice would corrupt the weights
        if getattr(self, 'prepared', False):
            return
        self.build_document_term_matrix()
        # TODO: Apply TF-IDF normalization or not?
        # The effect of normalization on large corpora should be studied.
        self.perform_tf_idf_normalization()
        self.perform_singular_value_decomposition()
        self.prepared = True
        print('Preparation complete.')
    def purge_punctuation(self, word):
        translator = str.maketrans('', '', string.punctuation)
        return word.lower().translate(translator)

    def normalize_word(self, word):
        word = self.purge_punctuation(word)
        return PorterStemmer.stem(word)

    def find_word_index(self, word, add=False):
        word = self.normalize_word(word)
        if word in self.dictionary:
            return self.dictionary.index(word)
        if add:
            self.dictionary.append(word)
            return len(self.dictionary) - 1
        return None
    def add_document(self, doc):
        words = [self.find_word_index(word, True) for word in doc.lower().split()]
        self.documents.append(words)
        for word in words:
            # Compare the normalized word itself, not its index, to the stop list
            if self.dictionary[word] in self.stop_words:
                continue
            elif word in self.known_words:
                self.known_words[word].append(len(self.documents) - 1)
            else:
                self.known_words[word] = [len(self.documents) - 1]
    def build_document_term_matrix(self):
        self.keys = [k for k in self.known_words.keys() if len(self.known_words[k]) > 0]
        self.keys.sort()
        self.document_term_matrix = zeros([len(self.keys), len(self.documents)])
        for i, k in enumerate(self.keys):
            for d in self.known_words[k]:
                self.document_term_matrix[i, d] += 1

    def perform_singular_value_decomposition(self):
        U, S, Vt = svd(self.document_term_matrix)
        self.unitary_matrix = U
        self.singularities = S
        self.transposed_unitary_matrix = Vt
    def perform_tf_idf_normalization(self):
        # Cell (i, j) becomes tf * idf, where
        #   tf  = occurrences of word i in document j / total words in document j
        #   idf = log(number of documents / number of documents containing word i)
        words_per_doc = sum(self.document_term_matrix, axis=0)
        docs_per_word = sum(asarray(self.document_term_matrix > 0, 'i'), axis=1)
        rows, columns = self.document_term_matrix.shape
        for i in range(rows):
            for j in range(columns):
                tf = self.document_term_matrix[i, j] / words_per_doc[j]
                idf = log(float(columns) / docs_per_word[i])
                self.document_term_matrix[i, j] = tf * idf
    def dump_document_term_matrix(self):
        self.prepare()
        print('Term-document matrix:')
        print()
        for i, row in enumerate(self.document_term_matrix):
            # Row i corresponds to the word self.keys[i], not dictionary entry i
            print('{:>30s} : {}'.format(self.dictionary[self.keys[i]], row))

    def dump_singular_value_decomposition(self):
        self.prepare()
        print('Singular values:')
        print(self.singularities)
        print('First 3 columns of the U matrix:')
        for i, row in enumerate(self.unitary_matrix):
            print(self.dictionary[self.keys[i]], row[0:3])
        print('First 3 rows of the Vt matrix:')
        print(-1 * self.transposed_unitary_matrix[0:3, :])
    def find(self, keyword):
        self.prepare()
        index = self.find_word_index(keyword)
        # Index 0 is a valid word, so test for None explicitly
        if index is None:
            print('The word does not occur in the corpus.')
            return []
        if index not in self.keys:
            print('The word is a stop word and cannot be searched for.')
            return []
        index = self.keys.index(index)
        print('Word: «{}», normal form: «{}»'.format(keyword, self.dictionary[self.keys[index]]))
        # Use at most 100 latent dimensions, skipping the first singular vector;
        # clamp to the matrix rank so small corpora stay in bounds
        dimensions = min(100, len(self.singularities) - 1)
        # Get the word's coordinates
        word = (-1 * self.unitary_matrix[:, 1:(dimensions + 1)])[index]
        print('Word: «{}» (no. {}), coordinates:\n{}'.format(keyword, index, word))
        results = []
        coordinates = -1 * self.transposed_unitary_matrix[1:(dimensions + 1), :]
        for i, document in enumerate(self.documents):
            vector = [coordinates[d][i] for d in range(dimensions)]
            delta = [float(word[d] - vector[d]) for d in range(dimensions)]
            # Squared Euclidean distance between the query word and the document
            distance = sum([d * d for d in delta])
            results.append({
                'number': i,
                'document': document,
                'vector': vector,
                'distance': distance,
            })
        return sorted(results, key=lambda record: record['distance'])
    def get_documents(self):
        self.prepare()
        # Clamp the number of latent dimensions to the rank of the matrix
        dimensions = min(250, len(self.singularities) - 1)
        results = []
        coordinates = -1 * self.transposed_unitary_matrix[1:(dimensions + 1), :]
        for i, document in enumerate(self.documents):
            vector = [coordinates[d][i] for d in range(dimensions)]
            results.append({
                'number': i,
                'vector': vector,
            })
        return sorted(results, key=lambda record: record['number'])
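

# Illustrative usage sketch (not part of the module): index a tiny hypothetical
# corpus and rank its documents by distance to a query word. The corpus strings
# and the stop list below are made-up examples.
if __name__ == '__main__':
    corpus = [
        'кот пьёт молоко',
        'собака грызёт кость',
        'кот и собака спят',
    ]
    lsa = LatentSemanticAnalyser(stop_words=['и'], ignored_characters='', documents=corpus)
    for record in lsa.find('кот'):
        print(record['number'], record['distance'])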
# coding: utf-8
class TextNormalizer:
    def __init__(self):
        pass

    def normalize(self, text):
        # Replace quotation marks, dashes and sentence punctuation with spaces
        for char in list('''«»"'„“”—–-!?,:;.'''):
            text = text.replace(char, ' ')
        return text


def test_normalize():
    text = '«Кайф жгуч: мем колюч!», — сказал мэр; присутствовавшие усмехнулись.'
    # Each replaced character leaves a space, so runs of spaces are expected
    expected = ' Кайф жгуч  мем колюч      сказал мэр  присутствовавшие усмехнулись '
    assert TextNormalizer().normalize(text) == expected
# coding=utf-8
import re


class PorterStemmer:
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    PERFECTIVE_GROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(
        u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)"
        u"|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(
        u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        word = word.lower()
        word = word.replace(u'ё', u'е')
        m = re.match(PorterStemmer.RVRE, word)
        if m and m.groups():
            pre = m.group(1)
            rv = m.group(2)
            # Step 1: strip a perfective gerund ending; if there is none, strip a
            # reflexive ending, then an adjectival (plus participle), verb or noun ending
            temp = PorterStemmer.PERFECTIVE_GROUND.sub('', rv, 1)
            if temp == rv:
                rv = PorterStemmer.REFLEXIVE.sub('', rv, 1)
                temp = PorterStemmer.ADJECTIVE.sub('', rv, 1)
                if temp != rv:
                    rv = temp
                    rv = PorterStemmer.PARTICIPLE.sub('', rv, 1)
                else:
                    temp = PorterStemmer.VERB.sub('', rv, 1)
                    if temp == rv:
                        rv = PorterStemmer.NOUN.sub('', rv, 1)
                    else:
                        rv = temp
            else:
                rv = temp
            # Step 2: strip a trailing "и"
            rv = PorterStemmer.I.sub('', rv, 1)
            # Step 3: strip the derivational suffix "ость"/"ост"
            if re.match(PorterStemmer.DERIVATIONAL, rv):
                rv = PorterStemmer.DER.sub('', rv, 1)
            # Step 4: strip a soft sign; otherwise strip a superlative ending
            # and collapse a double "н"
            temp = PorterStemmer.P.sub('', rv, 1)
            if temp == rv:
                rv = PorterStemmer.SUPERLATIVE.sub('', rv, 1)
                rv = PorterStemmer.NN.sub(u'н', rv, 1)
            else:
                rv = temp
            word = pre + rv
        return word
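

# Quick sanity check (illustrative, not part of the module): the stemmer
# reduces inflected Russian forms to a common stem, e.g. «бегущий» and
# «бегущая» both map to «бегущ».
if __name__ == '__main__':
    for word in (u'бегущий', u'бегущая', u'кошками'):
        print(word, '->', PorterStemmer.stem(word))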
@@ -3,11 +3,12 @@
 import json
 from flask import request
 from pony import orm
+from ..root import check_user
 from ...models import Post

 def init(app):
-    @app.route('/api/posts/', methods=['GET'])
+    @app.route('/api/posts', methods=['GET'])
     @orm.db_session
     def api_posts_get_all():
         limit = 10
@@ -28,19 +29,38 @@ def init(app):
     @app.route('/api/posts', methods=['POST'])
     @orm.db_session
     def api_posts_post():
-        # TODO: Check user credentials
-        # TODO: Insert record in database
+        request_json = request.get_json()
+        app_id = app.config['APP_ID']
+        app_secret = app.config['APP_SECRET']
+        viewer_id = request_json.get('viewer_id', None)
+        hash = request_json.get('hash', None)
+        if not check_user(app_id, viewer_id, app_secret, hash):
+            return json.dumps({
+                'status': 'error',
+                'error': {
+                    'type': 'unauthorized_access_attempt',
+                    'message': 'please provide VK user credentials',
+                }
+            })
+        post = Post(summary=request_json.get('summary', None),
+                    details=request_json.get('details', None),
+                    source=request_json.get('source', None),
+                    photo=request_json.get('photo', None),
+                    video=request_json.get('video', None),
+                    latitude=request_json.get('latitude', None),
+                    longitude=request_json.get('longitude', None),
+                    author=viewer_id,
+                    state='pending')
+        orm.commit()
         return json.dumps({
             'status': 'ok',
             'data': {
-                'id': 228,
+                'id': post.id,
             },
         })

     @app.route('/api/posts/<int:id>', methods=['GET'])
     def api_posts_get(id):
-        # TODO: Obtain data from database
+        try:
+            post = Post[id]
         return json.dumps({
@@ -52,10 +72,12 @@ def init(app):
             'source': post.source,
             'photo': post.photo,
             'video': post.video,
-            # TODO: Store latitude and longitude directly in the record maybe?
-            'location': post.location.to_json,
-            'author': 70127420,
-            'state': 'published',
+            'location': {
+                'latitude': post.latitude,
+                'longitude': post.longitude,
+            },
+            'author': post.author,
+            'state': post.state,
         }
     })
 except orm.core.ObjectNotFound:
......
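For reference, a client call against the new POST handler would look roughly like this. This is a hedged sketch: the host/port, the example field values, and the auth hash are placeholders, and a real hash must be computed according to the VK request-signing rules that check_user validates.

import requests

payload = {
    'viewer_id': 123456,               # placeholder VK user id
    'hash': '<computed-auth-hash>',    # placeholder; validated by check_user()
    'summary': 'Broken streetlight',   # made-up example content
    'details': 'The light on the corner has been out for a week.',
    'latitude': 59.93,
    'longitude': 30.31,
}
response = requests.post('http://localhost:5000/api/posts', json=payload)
print(response.json())  # expected: {'status': 'ok', 'data': {'id': <new post id>}}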
@@ -13,4 +13,5 @@ speaklater==1.3
 py-vkontakte==5.53.4
 webassets==0.12.0
 Werkzeug==0.11.15
-vklancer
\ No newline at end of file
+numpy
+vklancer
@@ -187,4 +187,4 @@ commands:
 children:
     flask: *flask
     redis: *redis
-    celery: *celery
\ No newline at end of file
+    celery: *celery