Commit a8da1f59 authored by Ilya Prokhorov

hunspell integration

parent fdc0460a
import re
import sys
regexp = "wall_post_text zoom_text\">(.*?)\</div"
regexp = "wall_post_text\">(.*?)\</div"
filename = sys.argv[1]
file = open(filename, "r")
......@@ -17,8 +17,9 @@ for m in match:
m = m.replace("<br>"," ")
if m.startswith("<a href"):
continue
print(m)
m = m.split("<br> <br>")[0].split(" ")[0]
formattedMatches.append(m)
print(m)
file = open("output.txt","w")
file.write("\n".join(formattedMatches))
......
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
......@@ -2,17 +2,23 @@ from textgenrnn import textgenrnn
from os import path
import sys
from random import uniform
import string
import hunspell
weights_filename = "textgenrnn_weights.hdf5"
vocab_filename = "textgenrnn_vocab.json"
config_filename = "textgenrnn_config.json"
default_dataset_filename = "dataset_quotes_ru.txt"
already_generated = []
print("Textgenrnn quotes text generator by demensdeum 2018 (demensdeum@gmail.com)")
spellChecker = hunspell.HunSpell("index.dic", "index.aff")
command_mode = False
if len(sys.argv) > 1:
if len(sys.argv) == 2:
print("Silent mode enabled")
command_mode = True
command_mode_state = sys.argv[1]
......@@ -60,7 +66,7 @@ if state == "train_reset" or state == "train_resume":
while True:
print("Endless train mode, every 4 epochs will be saved. CTRL+C to exit")
try:
textgen.train_from_file(dataset_file, num_epochs = 4, batch_size = 8096, new_model = reset_model)
textgen.train_from_file(dataset_file, num_epochs = 4, batch_size = 10024, new_model = reset_model)
except KeyboardInterrupt:
print("\nKilled")
exit(0)
......@@ -82,11 +88,11 @@ elif state == "generate" or state == "generate_unique":
if state == "generate_unique":
if command_mode == True:
if len(sys.argv) != 3:
if len(command_mode_state_array) != 3:
print("Incorrect generate_unique format, must be - generate_unique,10,dataset.txt")
exit(5)
else:
dataset_file = sys.argv[2]
dataset_file = command_mode_state_array[2]
else:
dataset_file = input("dataset filename? (%s) " % default_dataset_filename)
if len(dataset_file) < 1:
......@@ -96,6 +102,12 @@ elif state == "generate" or state == "generate_unique":
dataset_file_text = dataset_file_descriptor.read()
dataset_file_text_set = set(dataset_file_text.split("\n"))
filtered_dataset_file_text_set = []
for line in dataset_file_text_set:
line = ''.join(character for character in line if character not in string.punctuation)
filtered_dataset_file_text_set.append(line)
if len(dataset_file_text_set) < 1:
print("Incorrent dataset... Exit")
exit(6)
......@@ -104,14 +116,22 @@ elif state == "generate" or state == "generate_unique":
textgen = textgenrnn(weights_path = weights_filename, vocab_path = vocab_filename, config_path = config_filename)
if state == "generate":
generated = textgen.generate(temperature = uniform(0.8, 1.5), return_as_list=True)
generated = textgen.generate(temperature = uniform(0.5, 0.6), return_as_list=True)
print(generated[0])
elif state == "generate_unique":
generated = list(dataset_file_text_set)[0]
while generated in dataset_file_text_set:
generated = textgen.generate(temperature = uniform(0.8, 1.5), return_as_list=True)
while generated in filtered_dataset_file_text_set or generated in already_generated:
generated = textgen.generate(temperature = uniform(0.2, 2.0), return_as_list=True)
generated = generated[0]
generated = ''.join(character for character in generated if character not in string.punctuation)
generated = generated.split()
for word in generated:
if spellChecker.spell(word) == False:
generated = list(dataset_file_text_set)[0]
break
generated = ' '.join(generated)
print(generated)
already_generated.append(generated)
exit(0)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment