Historical Spellchecker

[insert_php]

/**
 * Split a block of text into its distinct, normalised words.
 *
 * The text is entity-decoded first, so that an encoded apostrophe such as
 * "&#039;" is turned back into "'" before "&", "#" and ";" are treated as
 * delimiters. Each token is then lower-cased, stripped of surrounding
 * apostrophes, of a leading "d'" or "l'", and of a trailing "'s" or "'d".
 *
 * @param string $line Text to tokenise; may contain HTML entities.
 * @return array Map of word => word with one entry per distinct word, in
 *               order of first appearance. Tokens that reduce to an empty
 *               string are omitted.
 */
function getWords($line) {
    $line = htmlspecialchars_decode($line, ENT_QUOTES);

    // The apostrophe is intentionally NOT a delimiter: it is handled per word
    // below so that contractions stay in one piece until they are trimmed.
    $delimiters = ' .,/?":;!@#$%^&*()-_+={}[]<>' . "\n\t\r";

    $words = array();
    for ($word = strtok($line, $delimiters); $word !== false; $word = strtok($delimiters)) {
        $word = strtolower($word);

        // Apostrophe handling: drop quoting apostrophes around the word ...
        $word = trim($word, "'");
        // ... an elided article/preposition in front of it ...
        if (strpos($word, "d'") === 0 || strpos($word, "l'") === 0) {
            $word = substr($word, 2);
        }
        // ... and a possessive / contracted ending.
        $ending = substr($word, -2, 2);
        if ($ending === "'s" || $ending === "'d") {
            $word = substr($word, 0, strlen($word) - 2);
        }

        // A token made only of apostrophes ends up empty; it is not a word and
        // must not be reported (an empty word would later match everywhere).
        if ($word === '' || $word === false) {
            continue;
        }

        $words[$word] = $word;
    }

    return $words;
}

/**
 * Load a corpus word list from disk.
 *
 * The file is read line by line under a shared lock; each line is trimmed and
 * stored as both key and value so that membership can be tested by key.
 * Execution stops with die() when the file cannot be opened.
 *
 * @param string $filename Path of the word-list file (one word per line).
 * @return array Map of word => word. The empty string is always present.
 */
function readCorpus($filename) {
    $corpusfile = fopen($filename, "r");
    if ($corpusfile === false) {
        die("Couldn't open $filename to read corpus");
    }

    $corpus = array();
    flock($corpusfile, LOCK_SH);
    // Loop on the result of fgets() itself rather than on feof(): a failed
    // read is then never passed on to trim() as if it were a line.
    while (($line = fgets($corpusfile)) !== false) {
        $word = trim($line);
        $corpus[$word] = $word;
    }
    flock($corpusfile, LOCK_UN);
    fclose($corpusfile);

    // The previous feof()-driven loop could register the empty string when its
    // last read hit end-of-file, depending on how the file ended. Keep that
    // entry, but deterministically, so an empty token is never reported as a
    // word missing from the corpus.
    $corpus[''] = '';

    return $corpus;
}

/**
 * Collect the words of a text that are absent from a corpus.
 *
 * @param string $userInput  Text to check; it is tokenised with getWords().
 * @param array  $currCorpus Corpus whose keys are the known words.
 * @return array Map of word => word for every word whose key is missing
 *               from $currCorpus, in the order getWords() produced them.
 */
function getWordsNotInCorpus($userInput, $currCorpus) {
    $unknown = array();

    foreach (getWords($userInput) as $candidate) {
        // Known word: nothing to report.
        if (array_key_exists($candidate, $currCorpus)) {
            continue;
        }
        $unknown[$candidate] = $candidate;
    }

    return $unknown;
}

// Normalises a submitted form value.
//
// Steps, in order: trim surrounding whitespace, remove backslashes with
// stripslashes(), HTML-escape the value with htmlspecialchars() (ENT_QUOTES),
// then run the $find -> $replace substitution table through str_replace().
// Returns the resulting string.
//
// NOTE(review): this copy of the code shows typographic quotes as string
// delimiters, and several neighbouring $find entries look identical. That
// suggests the original literals were HTML entities (presumably a named and a
// numeric form of each) that were decoded when the page was published, and
// that the $replace entries were altered the same way. The real table cannot
// be read off this copy - TODO restore both arrays from the original source.
function format_input($data)
{
$data = trim($data);
$data = stripslashes($data);
$data = htmlspecialchars($data, ENT_QUOTES);
// Search list. Judging by the glyphs: ellipsis, dashes, curly quotes and
// paragraph / line-break tags - presumably what the editor auto-inserts.
$find = array(“…”, “…”, “–”, “–”, “—”, “—”, “‘”, “‘”, “’”, “’”, ““”, ““”, “””, “””, “<p>”, “</p>”, “<br>”, “<br />”);
// Replacement list, positionally matched to $find (18 entries each).
$replace = array(“…”,”…”,”-“,”-“,”–“,”–“,”'”,”'”,”'”,”'”,”"”,”"”,”"”,”"”,”\n”,””,””,””);
$data = str_replace($find, $replace, $data);
return $data;
}

// Turns a text name into a display label by applying two str_replace()
// calls in sequence and returning the result.
//
// NOTE(review): the search and replacement literals in this copy have been
// through typographic formatting (curly quotes, a converted dash, possibly
// collapsed runs of spaces), so the exact original arguments of the second
// call cannot be determined here - TODO confirm against the original source.
function format_textname($txt) {
// First pass: hyphens become spaces.
$txt = str_replace(“-“,” “,$txt);
// Second pass: inserts a spaced dash separator (see the note above about the search string).
$txt = str_replace(” “,” – “,$txt);
return $txt;
}

// Request state. Everything starts empty so that the page can be rendered for
// a plain GET request as well as after a form submission.
$textdir = $_SERVER['DOCUMENT_ROOT']."/doohickies/texts";   // folder of "<name>.txt" word lists
$textnamesErr = $ficErr = $corpusErr = $flaggedWordsErr = "";
$fic = "";
$textnames = $selectedCorpus = $flaggedWords = array();

if ($_SERVER["REQUEST_METHOD"] === "POST") {
    // ---- Selected dictionary texts -------------------------------------
    $requestedTexts = isset($_POST["textnames"]) ? $_POST["textnames"] : null;

    if (!is_array($requestedTexts) || count($requestedTexts) === 0) {
        // Missing, empty, or not submitted as a list.
        $textnamesErr = "* Please select at least one text";
    } else {
        // scandir() returns false when the folder is unreadable; treat that as
        // "no texts available" instead of counting a non-array.
        $availableFiles = scandir($textdir);
        $maxTexts = is_array($availableFiles) ? count($availableFiles) : 0;

        if (count($requestedTexts) > $maxTexts) {
            $textnamesErr = "* Too many texts selected";
        } else {
            foreach ($requestedTexts as $i => $textname) {
                if (!is_string($textname)) {
                    $textnamesErr = "* Please select at least one text";
                    continue;
                }

                // The escaped name is the one that is stored, displayed and
                // later opened, so it is also the one that must be validated.
                $cleanName = format_input($textname);
                $textnames[$i] = $cleanName;

                // Reject path separators and NUL bytes so a posted name can
                // never point outside the texts folder, then check the file.
                if ($cleanName === ""
                    || strpbrk($cleanName, "/\\\0") !== false
                    || !file_exists("$textdir/$cleanName.txt")) {
                    // Only the escaped name goes into the message: it is
                    // echoed into the page further down.
                    $textnamesErr = "* $cleanName does not exist";
                }
            }
        }
    }

    // ---- Story text -------------------------------------------------------
    $rawFic = isset($_POST["fic"]) ? $_POST["fic"] : "";

    if (!is_string($rawFic) || $rawFic === "") {
        $ficErr = "* Story field can't be empty";
    } elseif (strlen($rawFic) > 35000) {
        // NOTE(review): the form text advertises 25,000 characters while this
        // limit is 35,000 bytes - presumably headroom for auto-inserted
        // markup; confirm the intended value.
        $ficErr = "* Story text is too long";
    } else {
        $fic = format_input($rawFic);
    }

    // ---- Spell check ------------------------------------------------------
    if (empty($textnamesErr) && empty($ficErr)) {
        foreach ($textnames as $textname) {
            // Array union instead of array_merge(): purely numeric words are
            // stored under integer keys, and array_merge() would renumber
            // those keys and break the key-based lookup.
            $selectedCorpus += readCorpus("$textdir/$textname.txt");
        }
        $flaggedWords = getWordsNotInCorpus($fic, $selectedCorpus);
    }
}
// ---- Page rendering ----
// NOTE(review): in this copy the string delimiters are typographic quotes and
// the echo strings below contain almost no markup. Presumably the HTML (form,
// selection list, text area, headings, highlighting) was stripped when the
// page was published - TODO restore this section from the original source
// before relying on it.
echo “

“;
echo “Select a text to use as a dictionary (shift or control for multiple):”;
// Read the contents of the texts folder; the page is aborted if that fails.
// NOTE(review): $textlist is not used in the visible remainder of this
// section - presumably the loop that printed the selection list was lost.
$textlist = scandir($textdir) or die(“ERROR: Couldn’t find folder with texts.”);
echo “\n”;
// Validation message for the text selection (empty when there was no error).
echo $textnamesErr;

// Story field label, followed by its validation message.
echo “
Enter your story here (25,000 characters maximum, about 4k-5k words): “.$ficErr;
echo “
“;
echo “\n* Note: WordPress’s auto-format might convert your dashes and ellipses and insert HTML tags at line breaks. Please don’t use what’s in the form as a working copy unless you can live with that.“;
echo “\n\n“;
echo “

“;

// ---- Results ----
echo “

Dictionary texts:

\n”;
// One entry per selected text, with the name passed through format_textname().
foreach($textnames as $textname) {
$displaytextname = format_textname($textname);
echo “$displaytextname
\n”;
}
echo “\n\n

Words in story that aren’t in dictionary:

\n”;
// Flagged words, each followed by a single space.
foreach($flaggedWords as $flaggedWord) {
echo “$flaggedWord “;
}
echo “\n\n

Story text:

\n”;
// Re-emit the story with one regex pass per flagged word.
$displayfic = $fic;
foreach($flaggedWords as $flaggedWord) {
// Escape the word the same way the story text was escaped.
$flaggedWord = htmlspecialchars($flaggedWord, ENT_QUOTES);
// These four words are skipped - presumably because the original replacement
// markup contained them and re-matching would corrupt earlier replacements;
// TODO confirm once the markup is restored.
if ($flaggedWord != “span” && $flaggedWord != “style” && $flaggedWord != “color” && $flaggedWord != “red”) {
// Case-insensitive match of the word when it is preceded by a character that
// is neither a word character nor an ampersand, and followed by a non-word
// character or an underscore. A word at the very start or very end of the
// text has no such neighbour and is therefore not matched.
// NOTE(review): the word is inserted into the pattern without preg_quote();
// confirm that no regex metacharacter can reach this point.
$rePattern = “/([^\\w&])(“.$flaggedWord.”)([\\W_])/i”;
// As shown here the replacement only re-emits the three captured groups.
$displayfic = preg_replace($rePattern, “$1$2$3″, $displayfic);
}
}
echo $displayfic;
[/insert_php]

Leave a Comment

CAPTCHA * Time limit is exhausted. Please reload CAPTCHA.