Commit 1216c584 authored by Milan Bartky's avatar Milan Bartky

Some refactoring and updating (Classes are cool, golang version of steve is complete garbage)

parent 9e8fb7e5
<?php
/**
 * Collection of small, stateless string and filesystem helper functions.
 */
class Helper
{
    /**
     * Checks if $haystack ends with $needle.
     *
     * An empty $needle is treated as a suffix of every string.
     *
     * @param string $haystack String to inspect.
     * @param string $needle   Suffix to look for.
     * @return bool True when $haystack ends with $needle.
     */
    public static function endsWith(string $haystack, string $needle) : bool
    {
        $length = strlen($needle);
        if ($length === 0) {
            return true;
        }
        // A needle longer than the haystack can never be its suffix.
        if ($length > strlen($haystack)) {
            return false;
        }
        return substr($haystack, -$length) === $needle;
    }

    /**
     * Checks if a string contains a substring.
     *
     * An empty $needle is reported as contained. This matches endsWith() and
     * avoids the "Empty needle" warning strpos() raises on older PHP versions.
     *
     * @param string $hay    String to search in.
     * @param string $needle Substring to look for.
     * @return bool True when $needle occurs anywhere in $hay.
     */
    public static function includes(string $hay, string $needle) : bool
    {
        if ($needle === '') {
            return true;
        }
        return strpos($hay, $needle) !== false;
    }

    /**
     * Lists json files below a directory, recursively.
     *
     * Entries are visited in scandir() order. Sub-directories are descended
     * into; a directory is never reported as a file, even if its name ends in
     * ".json". A path that is not a readable directory yields an empty list.
     *
     * @param string $dir Directory to scan (without trailing slash).
     * @return array List of "$dir/…/name.json" paths.
     */
    public static function listJsonFiles(string $dir) : array
    {
        if (!is_dir($dir)) {
            return [];
        }
        $entries = scandir($dir);
        if ($entries === false) {
            // Directory exists but could not be read.
            return [];
        }

        $list = [];
        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }
            $path = $dir . '/' . $entry;
            if (is_dir($path)) {
                $list = array_merge($list, self::listJsonFiles($path));
            } elseif (self::endsWith($entry, '.json')) {
                $list[] = $path;
            }
        }
        return $list;
    }
}
\ No newline at end of file
<?php
// Punctuation tokens after which Markov::combineStrings() appends a newline.
const LINE_TERMINATORS = ['.', '?', '!', ';'];

// Punctuation tokens that are attached to the previous word and followed by a space.
const SENTENCE_SPLITTERS = [',', ':'];

// Tokens that are placed between two words with a space on either side.
const WORD_COMBINERS = ['&', '/'];
/**
 * Markov chain text generator working on word n-grams.
 *
 * generateTable() builds a transition table from a training text and
 * generateText() walks that table to produce new text. Tokenizing relies on
 * the global constants LINE_TERMINATORS, SENTENCE_SPLITTERS and WORD_COMBINERS.
 */
class Markov
{
    /**
     * Transition table: state => [following state => number of occurrences].
     * @var array
     */
    private $table = [];

    /**
     * Replaces typographic apostrophes and quotes with their ASCII counterparts.
     * @param string $text
     * @return string
     */
    private function normalizeText(string $text) : string
    {
        $text = str_replace('’', '\'', $text);
        $text = str_replace(['“', '”'], '"', $text);
        return $text;
    }

    /**
     * Escapes a character for use inside a "/"-delimited regular expression.
     * @param string $char
     * @return string
     */
    private function quoteCharacter(string $char) : string
    {
        return preg_quote($char, '/');
    }

    /**
     * Splits a text into word and punctuation tokens.
     * @param string $text
     * @return array List of matched tokens in order of appearance.
     */
    private function tokenize(string $text) : array
    {
        $text = $this->normalizeText($text);

        $specials = array_merge(LINE_TERMINATORS, SENTENCE_SPLITTERS, WORD_COMBINERS);
        $specialCharacters = implode('', array_map(
            function (string $char) : string {
                return $this->quoteCharacter($char);
            },
            $specials
        ));

        $wordRegex = '/((?i)tar\.gz(?-i))|([a-zA-Z0-9]\.){2,}|[A-Z\-\']{2,}(?![a-z])|[A-Z\-\'][a-z\-\']+(?=[A-Z][A-Za-z])|[\'\w\-]+|[' .
            $specialCharacters . ']+/m';

        // With the default PREG_PATTERN_ORDER, $matches[0] holds every full match.
        if (preg_match_all($wordRegex, $text, $matches) === false) {
            return [];
        }
        return $matches[0];
    }

    /**
     * Appends a token to a string, choosing the whitespace by token type.
     * @param string $str Text built so far.
     * @param string $add Token to append.
     * @return string
     */
    private function combineStrings(string $str, string $add) : string
    {
        if (in_array($add, LINE_TERMINATORS, true)) {
            return $str . $add . "\n";
        }
        if (in_array($add, SENTENCE_SPLITTERS, true)) {
            return $str . $add . " ";
        }
        if (in_array($add, WORD_COMBINERS, true)) {
            return $str . " " . $add . " ";
        }
        return $str . " " . $add;
    }

    /**
     * Builds the transition table from a training text.
     *
     * Every run of $look_forward consecutive tokens forms one state; each
     * state is linked to the state that starts one token later.
     *
     * @param string $text         Training text.
     * @param int    $look_forward Number of tokens per state (values below 1 are treated as 1).
     */
    public function generateTable(string $text, int $look_forward = 4)
    {
        $words = $this->tokenize($text);
        $wordCount = count($words);
        $look_forward = max(1, $look_forward);

        // "<=" so that the final complete window of tokens is included as well.
        $states = [];
        for ($i = 0; $i + $look_forward <= $wordCount; $i++) {
            $state = $words[$i];
            for ($j = 1; $j < $look_forward; $j++) {
                $state = $this->combineStrings($state, $words[$i + $j]);
            }
            $states[] = $state;
        }

        $table = [];
        $stateCount = count($states);
        for ($i = 0; $i + 1 < $stateCount; $i++) {
            $current = $states[$i];
            $next = $states[$i + 1];
            if (!isset($table[$current][$next])) {
                $table[$current][$next] = 0;
            }
            $table[$current][$next]++;
        }
        $this->table = $table;
    }

    /**
     * Generates text by walking the transition table.
     *
     * Starts at a random state whose first character is already upper-cased
     * (any state if there is none) and keeps appending weighted-random
     * successors until the result is at least $length bytes long. When a
     * state has no successor, the walk continues from a random non-capitalized
     * state (any state if there is none); that restart state itself is not
     * appended to the result.
     *
     * @param int $length Minimum length of the result in bytes.
     * @return string Generated text, or an empty string when no table was built.
     */
    public function generateText(int $length) : string
    {
        if (!$this->table) {
            return '';
        }

        // Numeric-looking keys come back as integers; normalise them to strings.
        $states = array_map('strval', array_keys($this->table));
        $isCapitalized = function (string $state) : bool {
            return $state === ucfirst($state);
        };
        $capitalized = array_values(array_filter($states, $isCapitalized));
        $lowercase = array_values(array_filter(
            $states,
            function (string $state) use ($isCapitalized) : bool {
                return !$isCapitalized($state);
            }
        ));
        // Fall back to all states so that picking can never loop forever.
        $startPool = $capitalized ? $capitalized : $states;
        $restartPool = $lowercase ? $lowercase : $states;

        $word = $startPool[array_rand($startPool)];
        $result = $word;
        while (strlen($result) < $length) {
            $nextWord = $this->returnWeightedWord($this->table[$word] ?? []);
            if ($nextWord !== null) {
                $result .= ' ' . $nextWord;
                $word = $nextWord;
            } else {
                // Every table key has at least one successor, so the next
                // iteration is guaranteed to extend the result.
                $word = $restartPool[array_rand($restartPool)];
            }
        }
        return $result;
    }

    /**
     * Picks a key of $array at random, weighted by its value.
     *
     * The array must keep its keys: they are the candidate words. (shuffle()
     * would replace them with numeric indices.)
     *
     * @param array|null $array Map of word => positive integer weight.
     * @return string|null The chosen word, or null when there is nothing to choose from.
     */
    private function returnWeightedWord($array)
    {
        if (!$array) {
            return null;
        }
        $total = (int) array_sum($array);
        if ($total < 1) {
            return null;
        }
        $rand = mt_rand(1, $total);
        foreach ($array as $item => $weight) {
            if ($rand <= $weight) {
                return (string) $item;
            }
            $rand -= $weight;
        }
        return null;
    }
}
\ No newline at end of file
<?php
/**
* Class which has a bunch of helpful functions.
* Class which has a bunch of functions to normalize text.
*/
class Helper
class NormalizeHelper
{
/**
* Checks if $haystack ends with $needle.
* @param string $haystack
* @param string $needle
* @return bool
* Makes sure a string will be displayed correctly in the frontend.
* @param string $text
* @return string
*/
public static function endsWith(string $haystack, string $needle) : bool
public static function prepareTextForFrontend(string $text) : string
{
$length = strlen($needle);
if ($length == 0) {
return true;
}
$text = preg_replace('/>(\s*)/m', '', $text);
$text = preg_replace('/(?<![.!?])$/m', '.', $text);
$text = str_replace('.', ".\n", $text);
$text = str_replace('!', "!\n", $text);
$text = str_replace('?', "?\n", $text);
return (substr($haystack, -$length) === $needle);
}
$text = preg_replace('/[^a-z ,.?!\'0-9\n]/mi', '', $text);
/**
* Checks if a string contains a substring
* @param string $hay
* @param string $needle
* @return bool
*/
public static function includes(string $hay, string $needle) : bool
{
return strpos($hay, $needle) !== false;
}
$text = preg_replace('/^([^a-z]*)$/mi', '', $text);
$text = preg_replace('/^([^a-z]*)$/mi', '', $text);
$text = preg_replace('/^([^a-z]*)$/mi', '', $text);
/**
* Function for listing json files recursively
* @param string $dir
* @return array
*/
public static function listJsonFiles(string $dir) : array
{
$ffs = scandir($dir);
$list = [];
foreach ($ffs as $ff){
if ($ff != '.' && $ff != '..')
{
$text = str_replace("\n\n", "\n", $text);
if (Helper::endsWith($ff, ".json"))
{
$list[] = $dir.'/'.$ff;
}
if(is_dir($dir.'/'.$ff))
{
$list = array_merge($list, Helper::listJsonFiles($dir.'/'.$ff));
$text = preg_replace('/^( +)|( +)$/m', '', $text);
$text = preg_replace('/( {2,}})/m', ' ', $text);
$text = preg_replace('/(?<![a-z0-9,]) | (?![a-z0-9,])/mi', '', $text);
$nText = '';
$text = explode("\n", $text);
foreach($text as $t) {
$c = sizeof(explode(' ', $t));
if ($c > 1) {
if (!empty($nText)) {
$nText .= "\n";
}
$nText .= $t;
}
}
return $list;
$text = $nText;
return $text;
}
}
/**
* Class which has a bunch of functions for easier Regex handling.
*/
class RegexHelper
{
/**
* Removes all regex matches from a given string.
* @param string $regex
* @param string $string
* @return string
* Removes lines that were already used
* @param string $text
*/
public static function remove(string $regex, string $string) : string
public static function removeUsedLines(string &$text)
{
return preg_replace($regex, "", $string);
$lines = explode("\n", $text);
$newLines = [];
foreach($lines as $line) {
if(!exec('grep '.escapeshellarg(trim($line)).' ./steve/steve.txt')) {
$newLines[] = $line;
}
}
$text = implode("\n", $newLines);
}
/**
* Checks if a given Regex has any match in the given string.
* @param string $regex
* @param string $string
* @return bool
*/
/**
 * Checks if a given regex has any match in the given string.
 *
 * The string is trimmed before matching. preg_match() is used so that
 * zero-length matches count as matches and a PCRE error (false) is
 * reported as "no match" instead of as a match.
 *
 * @param string $regex  Complete PCRE pattern including delimiters.
 * @param string $string Text to test.
 * @return bool True when the pattern matches at least once.
 */
public static function check(string $regex, string $string) : bool
{
    return preg_match($regex, trim($string)) === 1;
}
}
/**
* Class which has a bunch of functions to normalize text.
*/
class NormalizeHelper
{
/**
* Does some initial normalizing (When reading the post).
* @param string $text
......@@ -115,86 +87,86 @@ class NormalizeHelper
* @param string $line
* @return bool
*/
/**
 * Tells whether a line matches one of the "broken line" patterns.
 *
 * The method is declared only once here; a second identical declaration
 * would be a "Cannot redeclare" error.
 *
 * @param string $line Line to inspect.
 * @return bool True as soon as one pattern matches, false if none does.
 */
private static function checkBrokenLine(string $line) : bool
{
    $regexes = [
        // Starts with at least one character that is not a letter.
        '/^([^abcdefghijklmnopqrstuvwxyz]+)/mi',
        // Does not end with '.', '?' or '!'.
        '/([^.?!]+)$/m',
        // Ends with a space, one or two letters and one more character.
        '/ ([abcdefghijklmnopqrstuvwxyz]{1,2}).$/mi',
        // Has a character that is neither a letter nor a dot right before the final '.', '!' or '?'.
        '/([^abcdefghijklmnopqrstuvwxyz.]+)[.!?]$/mi',
        // Contains a parenthesis.
        '/[()]/m',
        // Contains at least three commas with text on both sides of each.
        '/((.+),(.+)){3,}/mi'
    ];

    foreach ($regexes as $regex) {
        if (RegexHelper::check($regex, $line)) {
            return true;
        }
    }
    return false;
}
/**
* Checks if a line contains a bad word (which is not twitter suitable)
* @param string $line
* @return bool
*/
/**
 * Checks if a line contains one of the blocked words.
 *
 * The comparison is a case-insensitive substring search, so list entries
 * written with capital letters (such as 'Ylyl') are matched as well.
 *
 * @param string $line Line to inspect.
 * @return bool True when at least one blocked word occurs in the line.
 */
private static function containsBadWords(string $line) : bool
{
    $words = [
        'fuck',
        'nigger',
        '22ch',
        '4ch',
        'dick',
        'm00t',
        'twoot',
        'twot',
        'jew',
        'discord',
        'gay',
        'fag',
        'Ylyl',
        'mfw',
        'tfw',
        'tomoko',
        'trip',
        'dub',
        'chan',
        'downvote',
        'upvote',
        'cringe',
        'reddit',
        'meme',
        'porn'
    ];

    foreach ($words as $word) {
        if (stripos($line, $word) !== false) {
            return true;
        }
    }
    return false;
}
/**
* Removes bad lines from a string
* @param string $text
* @return string
*/
public static function removeDirt(string $text) : string
public static function removeDirt(string $text, bool $checkBad = true) : string
{
$lines = explode("\n", $text);
$text = "";
foreach ($lines as $line)
{
$line = trim($line);
if (self::checkBrokenLine($line) || self::containsBadWords($line))
$line = trim($line);
if (self::checkBrokenLine($line) || ($checkBad && self::containsBadWords($line)))
{
continue;
continue;
}
if ($text !== "")
{
$text .= "\n";
}
$text .= $line;
}
return $text;
......@@ -223,22 +195,22 @@ class NormalizeHelper
*/
public static function advancedCleanup(string &$text) : string
{
$text = preg_replace('/["§$%{}\[\]]/m', '', $text);
$text = preg_replace('/["§$%{}\[\]]/m', '', $text);
$text = str_replace(".", ".\n", $text);
$text = str_replace("?", "?\n", $text);
$text = str_replace("!", "!\n", $text);
$text = str_replace(">", "\n>", $text);
$text = str_replace(";", ";\n", $text);
$text = str_replace(".", ".\n", $text);
$text = str_replace("?", "?\n", $text);
$text = str_replace("!", "!\n", $text);
$text = str_replace(">", "\n>", $text);
$text = str_replace(";", ";\n", $text);
$text = preg_replace('/^( +)/m', '', $text);
$text = preg_replace('/( +)$/m', '', $text);
$text = preg_replace('/(\n+)/m', "\n", $text);
$text = preg_replace('/^( +)/m', '', $text);
$text = preg_replace('/( +)$/m', '', $text);
$text = preg_replace('/(\n+)/m', "\n", $text);
$text = preg_replace('/([a-z\d])$/mi', '$0.', $text);
$text = preg_replace('/([a-z\d])$/mi', '$0.', $text);
preg_match_all('/^.+ .+ .+$/mi', $text, $matches);
$text = implode("\n", $matches[0]);
preg_match_all('/^.+ .+ .+$/mi', $text, $matches);
$text = implode("\n", $matches[0]);
$lines = explode("\n", $text);
foreach($lines as &$line) {
......@@ -333,38 +305,38 @@ class NormalizeHelper
* @param array $array
* @return array
*/
/**
 * Removes every element that is exactly the empty string.
 *
 * Keys of the remaining elements are preserved. The array is modified in
 * place and also returned. The method is declared only once here; a second
 * identical declaration would be a "Cannot redeclare" error.
 *
 * @param array $array Array to clean (passed by reference).
 * @return array The cleaned array.
 */
public static function removeEmptyElements(array &$array) : array
{
    $kept = [];
    foreach ($array as $key => $value) {
        if ($value !== '') {
            $kept[$key] = $value;
        }
    }
    $array = $kept;
    return $array;
}
/**
* Lowercases words that are written in caps lock (HELLO -> hello).
* @param string $text
* @return string
*/
/**
 * Lowercases every run of two or more consecutive capital letters (HELLO -> hello).
 *
 * Single capitals, such as the first letter of a sentence, are left alone.
 * The text is modified in place and also returned. Every caps run is
 * handled, not only the first one found.
 *
 * @param string $text Text to convert (passed by reference).
 * @return string The converted text.
 */
public static function lowercaseTheUpercase(string &$text) : string
{
    $lowered = preg_replace_callback(
        '/[A-Z]{2,}/',
        function (array $match) : string {
            return strtolower($match[0]);
        },
        $text
    );
    // preg_replace_callback() returns null on a PCRE error; keep the input then.
    if ($lowered !== null) {
        $text = $lowered;
    }
    return $text;
}
/**
* Removes links from a string
* @param string $text
* @return string
*/
public static function removeLinks(string &$text) : string
{
$text = RegexHelper::remove('/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i', $text);
public static function removeLinks(string &$text) : string
{
$text = RegexHelper::remove('/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i', $text);