Commit 638a750e authored by Kathryn Elliott

Added text cleaning from LDA-tutorial.py. And debugged.

parent 844565b2
......@@ -28,8 +28,10 @@ def clean_newsgroup_posts(input, output):
def clean_newsgroup_post(line):
    """Strip newsgroup boilerplate from one post and flatten it to one line.

    The substitutions run in this order:

    1. Remove known header fields (name, colon and the rest of that line).
    2. Cut sign-offs ("Thanks", "Regards") and separator rules from where
       they start through the end of their line.
    3. Replace newlines, tabs and quoting/markup tokens with a space.
    4. Remove e-mail addresses.
    5. Collapse every whitespace run to a single space.
    6. Delete single quotes.

    The cleaned text is also printed to stdout as a trace.

    Args:
        line: Raw text of one post; may span several lines.

    Returns:
        The cleaned text. Only trailing whitespace is stripped, so a single
        leading space can remain (e.g. where header lines were removed).
    """
    # Raw strings keep regex escapes such as \S and \* out of Python's own
    # string-escape processing (invalid escapes in plain literals are
    # deprecated). The dots in "Article-I.D." are escaped so they only match
    # literal dots.
    line = re.sub(
        r'(From|Subject|Nntp-Posting-Host|NNTP-Posting-Host|Organization|Lines'
        r'|Article-I\.D\.|Distribution|Expires|Reply-To|X-Newsreader'
        r'|Originator): .*',
        '', line.rstrip())
    line = re.sub(
        r'(Thanks|Regards|---|====|____|\*\*\*\*|####|%%%%|\.\.\.\.|~~~~|==>).*',
        '', line.rstrip())
    # "^^^" and "!-*-!-*-" are literal markers, so "^" and "*" must be
    # escaped. Unescaped, "^^^" matched the empty string at the start of the
    # text (prepending a space) and "*" acted as a quantifier.
    line = re.sub(r'(\n|\t|>>|\\|\||\^\^\^|!-\*-!-\*-)', ' ', line.rstrip())
    line = re.sub(r'\S*@\S*\s?', '', line.rstrip())  # Removing emails
    line = re.sub(r'\s+', ' ', line.rstrip())  # Removing new lines
    line = line.replace("'", "")  # Removing single quotes
    print(line)
    return line
......
#!/usr/bin/env python3
import re
import json
import sys
import click
@click.command()
@click.argument('input', type=click.File('r'))
@click.argument('output', type=click.File('w'))
def clean_newsgroup_posts(input, output):
    """Clean every post under the "content" key of INPUT and write OUTPUT.

    INPUT is parsed as JSON. Each value under "content" is replaced by the
    result of clean_newsgroup_post, and the whole document is written to
    OUTPUT as JSON with sorted keys and a 4-space indent. The cleaned
    document is also returned.
    """
    # "content" is indexed by the same values it yields when iterated, so it
    # is presumably a JSON object (id -> post text) - verify against the data.
    document = json.load(input)
    posts = document["content"]
    for key in posts:
        # One dot per post as a progress indicator.
        # NOTE(review): this goes to stdout; if OUTPUT can also be stdout the
        # dots would mix with the JSON - confirm how the script is invoked.
        print(".", end='', flush=True)
        posts[key] = clean_newsgroup_post(posts[key])
    json.dump(document, output, sort_keys=True, indent=4)
    return document
def clean_newsgroup_post(line):
    """Remove header fields and sign-off/separator tails from one post.

    Known header fields are deleted from the field name to the end of that
    line. Then everything from a sign-off ("Thanks") or a separator rule to
    the end of its line is deleted. Line breaks are kept. The result is also
    printed to stdout as a trace.

    Args:
        line: Raw text of one post; may span several lines.

    Returns:
        The text with the matched spans removed.
    """
    # Raw strings keep regex escapes such as \* and \. out of Python's own
    # string-escape processing (invalid escapes in plain literals are
    # deprecated). The dots in "Article-I.D." are escaped so they only match
    # literal dots.
    line = re.sub(
        r'(From|Subject|Nntp-Posting-Host|NNTP-Posting-Host|Organization|Lines'
        r'|Article-I\.D\.|Distribution|Expires|Reply-To|X-Newsreader'
        r'|Originator): .*',
        '', line.rstrip())
    line = re.sub(r'(Thanks|---|====|____|\*\*\*\*|####|%%%%|\.\.\.\.).*', '', line)
    print(line)
    return line
# Visual separator written to stdout before the sample below is defined.
print("########################################################")
# Sample newsgroup post (header fields, body text, a "Thanks" sign-off and a
# dashed signature rule) kept as a fixture for trying the cleaner by hand.
line = "From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"
# Disabled manual check: uncomment to run the cleaner on the sample above.
#print(clean_newsgroup_post(line))
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    clean_newsgroup_posts()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment