Commit 6b4ecd75 authored by Milan Bartky's avatar Milan Bartky

Added new features & cleaned up the code

parent 5367f8c4
......@@ -17,6 +17,14 @@ class Helper
return (substr($haystack, -$length) === $needle);
}
/**
 * Tells whether $needle occurs anywhere inside $hay.
 *
 * @param string $hay    Text that is searched.
 * @param string $needle Text that is looked for.
 * @return bool True when strpos() reports a position, false when it reports no match.
 */
public static function includes($hay, $needle)
{
    $position = strpos($hay, $needle);
    // A match at offset 0 is still a match, so only the strict false counts as "absent".
    if ($position === false)
    {
        return false;
    }
    return true;
}
/**
* Function for listing json files recursively
*/
......@@ -100,6 +108,33 @@ class NormalizeHelper
return $text;
}
/**
 * Second cleanup pass that splits the text into sentence-like lines.
 *
 * The text is rewritten in place through the reference parameter and the
 * same value is returned as well.
 *
 * @param string $text Text to clean up; overwritten with the result.
 * @return string The cleaned text.
 */
public static function advancedCleanup(&$text)
{
    // Drop quotes, section/dollar/percent signs, braces and square brackets.
    $text = preg_replace('/["§$%{}\[\]]/m', '', $text);

    // Start a new line after ".", "?", "!" and ";" and in front of every ">".
    $text = str_replace(
        array(".", "?", "!", ">", ";"),
        array(".\n", "?\n", "!\n", "\n>", ";\n"),
        $text
    );

    // Strip spaces at both ends of every line, then collapse runs of newlines.
    $text = preg_replace(
        array('/^( +)/m', '/( +)$/m', '/(\n+)/m'),
        array('', '', "\n"),
        $text
    );

    // Upper-case the first character of every line.
    $lines = explode("\n", $text);
    foreach ($lines as $index => $line)
    {
        $lines[$index] = ucfirst($line);
    }
    $text = implode("\n", $lines);

    // Lines that end in a letter or digit get a closing full stop.
    $text = preg_replace('/([a-z\d])$/mi', '$0.', $text);

    // Keep only lines that contain at least two spaces with text around them.
    preg_match_all('/^.+ .+ .+$/mi', $text, $kept);
    $text = implode("\n", $kept[0]);

    return $text;
}
/**
* Removes references from a given text (>234).
*/
......
......@@ -6,7 +6,7 @@ include "markov.php";
if (isset($_GET["g"]) || !file_exists("data.txt"))
{
// Including vichans config (for database credentials)
require_once("../inc/instance-config.php");
require_once("../../www/inc/instance-config.php");
// Making sure the cache file is deleted
unlink("data.txt");
......@@ -107,20 +107,118 @@ $origtext = file_get_contents("data.txt");
// Shuffle the lines of the cached source text.
$text = Helper::shuffleLines($origtext);
// Remove the bytes 0x10-0x1F and 0x80-0xFF (described originally as "unicode to utf-8 errors").
$text = preg_replace('/[\x10-\x1F\x80-\xFF]/', '', $text);
// Build the markov table and then generate text with a length of $length (200) characters.
// NOTE(review): the 5 passed to both markov calls is presumably the look-back size - confirm in markov.php.
$length = 200;
$table = generate_markov_table($text, 5);
// NOTE(review): $generated is assigned twice in a row and the first result is overwritten unused;
// this looks like a removed/added line pair from the diff view - confirm which one belongs here.
$generated = generate_markov_text(200, $table, 5);
$generated = generate_markov_text($length, $table, 5);
/*
* Clean up the generated string: quickCleanup() hands back the cleaned text,
* advancedCleanUp() rewrites $generated in place through its reference parameter.
*/
$generated = NormalizeHelper::quickCleanup($generated);
NormalizeHelper::advancedCleanUp($generated);
/*
* Preparation for the optional word filter: if the GET parameter w is set, the code
* that follows generates only lines which contain the word given in w.
*/
$final = "";
// From here on $length is reused as the number of lines wanted, no longer a character count.
$length = 4;
// NOTE(review): a text/plain header and an echo directly ahead of the HTML page that follows
// look like leftovers of the previous plain-text output - confirm whether these two lines still apply.
header("Content-type: text/plain");
echo trim($generated);
/*
 * Optional word filter (?w=...).
 *
 * When the requested word occurs in the source text, up to $length generated lines
 * that each contain it are collected in $final and shown with the word in bold.
 * When the word is set but does not occur in the source text at all, the script
 * stops with a message, because the generator could never produce it.
 * A missing, empty or non-string w leaves $generated untouched.
 */
$word = (isset($_GET["w"]) && is_string($_GET["w"]) && $_GET["w"] !== "") ? $_GET["w"] : null;
if ($word !== null && Helper::includes($text, $word))
{
    // Upper bound on regeneration rounds: a word that exists in the source but is always
    // stripped by the cleanup (e.g. "$" or "%") must not keep the request looping forever.
    $attemptsLeft = 1000;
    // "/" is the pattern delimiter, so it has to be passed to preg_quote explicitly.
    $linePattern = '/^(.*)'.preg_quote($word, '/').'(.*)$/mi';
    // The text generated before this point is unfiltered; reduce it to matching lines first
    // so that only lines containing the word can end up in $final.
    preg_match_all($linePattern, $generated, $matches);
    $generated = implode("\n", $matches[0]);
    /*
     * For the amount of lines we want
     */
    for ($x = 0; $x < $length; $x++)
    {
        /*
         * Generate text until we have lines which contain the word and which are
         * not part of $final yet, or until $final is long enough / we give up.
         */
        while ((!Helper::includes($generated, $word) || Helper::includes($final, $generated))
            && sizeof(explode("\n", $final)) < $length
            && $attemptsLeft-- > 0)
        {
            // Generating some more text using markov chains
            $generated = generate_markov_text(200, $table, 5);
            // Cleaning the generated text up
            $generated = NormalizeHelper::quickCleanup($generated);
            NormalizeHelper::advancedCleanUp($generated);
            // Getting only the lines that contain the word.
            preg_match_all($linePattern, $generated, $matches);
            $generated = implode("\n", $matches[0]);
        }
        // After giving up, $generated may be empty or a repeat; only usable text is appended.
        $usable = Helper::includes($generated, $word) && !Helper::includes($final, $generated);
        // Only adding the lines when the string is not already at the target length.
        if ($usable && sizeof(explode("\n", $final)) < $length)
        {
            if (!empty($final))
            {
                $final .= "\n";
            }
            $final .= $generated;
        }
    }
    // Marking all occurrences of the wanted string bold. The text is split on the raw word and
    // every piece is HTML-escaped separately, so neither the request parameter nor the generated
    // text can inject markup and the <b> tags never land inside an escaped entity.
    $safeWord = htmlspecialchars($word, ENT_QUOTES);
    $pieces = array_map(
        function ($piece)
        {
            return htmlspecialchars($piece, ENT_QUOTES);
        },
        explode($word, $final)
    );
    $generated = implode("<b>".$safeWord."</b>", $pieces);
}
else if ($word !== null)
{
    /*
     * If the word was not found inside the original text, there is no chance
     * of it ever appearing, so we die with a warning message.
     */
    die("Word was not found at all!");
}
?>
<html>
<head>
<title>Steve</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="viewport" content="width=device-width, initial-scale=1">
<script src="./markov-text.js"></script>
<style>
pre {
white-space: pre-wrap;
word-wrap: break-word;
-moz-control-character-visibility: visible;
}
html:not([dir]) pre {
unicode-bidi: plaintext;
}
</style>
<script>
// Passing the original text to the JavaScript as a JSON string literal.
// json_encode() escapes quotes, backslashes and line breaks; the JSON_HEX_* flags also encode
// the characters <, >, & and ', so the text can neither end the surrounding script element nor
// leave the string. Backticks are still removed as before. An empty string is used if encoding fails.
var data = <?php echo json_encode(str_replace("`", "", $text), JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT) ?: '""'; ?>;
</script>
</head>
<body>
<!-- Outputting the generated text. -->
<pre><?php echo trim($generated); ?></pre>
<!-- Letting JavaScript also try to generate some text using markov chains. -->
<br>
<br>
<br>
<pre id="javascript">Loading...</pre>
<script>
// Creating a new MarkovText object with a word depth of 2.
var markv = new MarkovText(2);
// Letting it build its table from the original text that was passed over in `data` earlier.
markv.learn(data);
// Asking it for 20 generated words (they follow a randomly chosen starting key).
var generatedWords = markv.output(20);
// Writing the generated words into the pre element with the id "javascript".
document.getElementById("javascript").innerText = generatedWords;
</script>
</body>
</html>
/**
 * Markov-style text generator that works on word n-grams.
 * @param {Number} WordDepth Size of the n-gram; any falsy value falls back to 2
 * @param {String} Text (Optional) Text that is learned right away
 */
function MarkovText(wordDepth, text) {
    // The "n" of the n-gram model
    this.wordDepth = wordDepth ? wordDepth : 2;
    // Lookup table: key string -> follower bookkeeping
    this.words = {};
    if (!text) {
        return;
    }
    this.learn(text);
}
/**
 * "Learn" word by word from a given text.
 *
 * For every run of wordDepth consecutive words, the first wordDepth-1 words
 * (each followed by a space) form the key and the last word is recorded as a
 * follower of that key. Each follower owns the index range
 * [__i, __i + __o) inside [0, __m), where __o is how often it was seen and
 * __m is the total number of followers recorded for the key.
 *
 * @param {String} Text Text to learn from (best with minimal special characters)
 */
MarkovText.prototype.learn = function(text) {
    var hasOwn = Object.prototype.hasOwnProperty;
    // Break up the text into individual words by spaces
    var newWords = text.split(" ");
    // Last start index at which a full n-gram still fits; inclusive, so the
    // final n-gram of the text is learned as well.
    var lastStart = newWords.length - this.wordDepth;
    for (var i = 0; i <= lastStart; i++) {
        var key = "";
        for (var k = 0; k < this.wordDepth - 1; k++) {
            key += newWords[i + k] + " ";
        }
        var nextWord = newWords[i + this.wordDepth - 1];
        // Make sure a bucket for this key exists. It is created without a
        // prototype so that words such as "constructor", "toString" or
        // "__proto__" are stored as ordinary followers instead of resolving
        // to inherited built-ins.
        if (!this.words[key]) {
            var bucket = Object.create(null);
            bucket.__m = 0;
            this.words[key] = bucket;
        }
        var followers = this.words[key];
        // Own-property check, so buckets that came in through load() as plain
        // objects are handled correctly too.
        // NOTE(review): a literal word "__m" would still collide with the counter.
        if (!hasOwn.call(followers, nextWord) || !followers[nextWord]) {
            followers[nextWord] = {
                __i: followers.__m,
                __o: 1 // max index is index + occurrences
            };
        }
        else {
            followers[nextWord].__o++;
        }
        // Shift the start index of every other follower at or above this one
        // up by one, so the index ranges stay disjoint.
        var entry = followers[nextWord];
        for (var prop in followers) {
            if (prop !== nextWord && followers[prop].__i >= entry.__i) {
                followers[prop].__i++;
            }
        }
        followers.__m++;
    }
};
/**
 * Produce a string from what the model knows.
 *
 * Starts at a random key and appends up to SentenceLength follower words.
 * Generation stops early when the current key has no recorded followers;
 * an empty model yields an empty string.
 *
 * @param {Number} SentenceLength Maximum number of words to append after the starting key
 * @return {String} Sentence Generated sentence
 */
MarkovText.prototype.output = function(SentenceLength) {
    var key = this.randomRootWord();
    // Nothing has been learned yet, so there is no key to start from.
    if (key === undefined) {
        return "";
    }
    var generatedWords = key.split(" ");
    var outputString = generatedWords.join(" ");
    for (var i = 0; i < SentenceLength; i++) {
        var followers = this.words[key];
        // Dead end: this word sequence was never seen with a follower.
        if (!followers) {
            break;
        }
        var newWord = this.findByIndex(this.randomFromZero(followers.__m), followers);
        if (newWord === undefined) {
            break;
        }
        // Slide the window: drop the oldest word, append the new one.
        var nextKey = generatedWords.splice(1, this.wordDepth - 1);
        nextKey.pop(); // Last element is always blank, pop it for easiness
        nextKey.push(newWord);
        key = nextKey.join(" ") + " ";
        generatedWords = key.split(" ");
        outputString += newWord + " ";
    }
    return outputString;
};
/**
 * Pick one of the model's keys at random.
 * @return {String} Word Randomly chosen key of the word table (undefined when the table is empty)
 */
MarkovText.prototype.randomRootWord = function() {
    var roots = Object.keys(this.words);
    var pick = this.randomFromZero(roots.length);
    return roots[pick];
};
/**
 * Generate a random whole number starting at 0 and staying below the number passed
 * @param {Number} Maximum Upper limit; for a positive limit the result is always smaller than it
 * @return {Number} RandomNumber Math.random() scaled by the limit and rounded down
 */
MarkovText.prototype.randomFromZero = function(max) {
    var scaled = Math.random() * max;
    return Math.floor(scaled);
};
/**
 * Find the word whose index range contains the given index.
 *
 * A word matches when __i <= index < __i + __o. The "__m" counter entry is
 * skipped. When no object is passed, the model's own word table is searched
 * instead of dereferencing the missing argument.
 *
 * @param {Number} Index Index to look up
 * @param {Object} WordObject (Optional) Object holding possible words
 * @return {String} Word Word found for that index, or undefined when none matches
 */
MarkovText.prototype.findByIndex = function(index, object) {
    // Resolve the fallback once and use it for both iteration and lookup.
    var source = object || this.words;
    for (var prop in source) {
        if (prop === "__m") {
            continue;
        }
        var entry = source[prop];
        if (entry.__i <= index && entry.__i + entry.__o > index) {
            // Return the word
            return prop;
        }
    }
};
/**
 * Forget everything learned so far by swapping in an empty word table
 */
MarkovText.prototype.reset = function() {
    var emptyTable = {};
    this.words = emptyTable;
};
/**
 * Replace the model's word table with the one passed in
 * @param {Object} Words Word table to use from now on (stored as-is, not copied)
 */
MarkovText.prototype.load = function(words) {
    var table = words;
    this.words = table;
};
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment