Added exctract.py which extracts course information from unofficial transcripts

parent 9affc399
#!/usr/bin/env python
"""
This file is part of Graduation Audit System.
Copyright (C) 2016 Saikiran Srirangapalli <[email protected]>
Graduation Audit System is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Graduation Audit System is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Graduation Audit System. If not, see <http://www.gnu.org/licenses/>.
"""
from __future__ import print_function
import re
import sys
from subprocess import Popen, PIPE
# Subject prefixes that may start a course entry on the transcript.
COURSE_PREFIXES = [
    'ACCT', 'ACTS', 'AHST', 'AMS', 'AP', 'ARAB', 'ARHM', 'ARTS', 'ATEC',
    'ATEM', 'BA', 'BCOM', 'BIOL', 'BIS', 'BLAW', 'BMEN', 'BPS', 'BUSI',
    'CE', 'CGS', 'CHEM', 'CHIN', 'CLDP', 'COMM', 'COSC', 'CRIJ', 'CRIM',
    'CRWT', 'CS', 'DANC', 'DRAM', 'ECON', 'ECS', 'ECSC', 'ED', 'EE',
    'EMAC', 'ENGL', 'ENGR', 'ENGY', 'ENTP', 'ENVR', 'EPPS', 'FILM', 'FIN',
    'FREN', 'GEOG', 'GEOL', 'GEOS', 'GERM', 'GISC', 'GOVT', 'GST', 'HIST',
    'HLTH', 'HMGT', 'HONS', 'HUMA', 'IMS', 'IPEC', 'ISAE', 'ISAH', 'ISEC',
    'ISIS', 'ISNS', 'ISO', 'ISSS', 'ITSS', 'JAPN', 'LANG', 'LIT', 'MATH',
    'MECH', 'MECO', 'MIS', 'MKT', 'MUSI', 'NANO', 'NATS', 'NSC', 'OBHR',
    'OPRE', 'PA', 'PHIL', 'PHIN', 'PHYS', 'PSCI', 'PSY', 'PSYC', 'REAL',
    'RHET', 'RMIS', 'SE', 'SOC', 'SOCI', 'SOCS', 'SPAN', 'SPAU', 'SPCH',
    'STAT', 'TE', 'UNIV', 'VIET',
]

# Regex fragments for the pieces that follow the prefix in a course entry.
# Number: whitespace, a leading digit 0-4, then three characters drawn from
# digits and '-' (the second may also be 'V').
COURSE_NUMBER = r'\s+[0-4][0-9V-][0-9-][0-9-]'
# Description: a run of whitespace, upper-case letters and '+', '-', '&', '/'.
COURSE_DESCRIPTION = r'[\s+A-Z-&/]+'
# Credit: two "<digit>.000" columns separated by one whitespace character.
COURSE_CREDIT = r'[0-9]\.000\s[0-9]\.000'

# One "<prefix><number>" pattern per known prefix.
CLASS_REGEXES = [prefix + COURSE_NUMBER for prefix in COURSE_PREFIXES]

# Every course entry matches this pattern: an alternation with one full
# "<prefix><number><description><credit>" branch per prefix.
ENTRY_REGEX = re.compile('|'.join(
    course + COURSE_DESCRIPTION + COURSE_CREDIT for course in CLASS_REGEXES))

# Pulls the individual fields back out of an entry matched by ENTRY_REGEX.
EXTRACTION_REGEX = (
    r'([A-Z]+)'                      # group 1: subject prefix
    r'\s+'
    r'([0-4][0-9V-][0-9-][0-9-])'    # group 2: course number
    r'[\s+A-Z-&/]+'                  # description (not captured)
    r'([0-9])\.000\s([0-9])\.000'    # groups 3 and 4: the two credit digits
)
class TranscriptConversionError(Exception):
    """
    Error during conversion of unofficial transcript (pdf)
    to ascii format

    Attributes:
        msg: human-readable description of what went wrong
        cmd: the command (argument list) that failed
        err: whatever the failed command wrote to stderr
    """

    def __init__(self, msg, cmd, err):
        # Forward the arguments to Exception so that ``args`` is populated;
        # without this the exception cannot be copied or pickled and
        # tracebacks show no message.
        super(TranscriptConversionError, self).__init__(msg, cmd, err)
        self.msg = msg
        self.cmd = cmd
        self.err = err

    def __str__(self):
        # Only the message: cmd and err are available as attributes for
        # callers that want to report them separately.
        return str(self.msg)
def extract_courses(unofficial_transcript):
    """
    Extract course information from an unofficial transcript.

    The pdf is converted to postscript with ``pdf2ps`` and then to plain
    text with ``ps2ascii``; course entries are pulled out of that text with
    ENTRY_REGEX and split into fields with EXTRACTION_REGEX.

    input: unofficial transcript pdf file (from galaxy)
    output: list of tuples of courses, credit hours possible, credit hours
            earned; the course is the lower-cased prefix followed by the
            course number and the credit hours are ints
    raises: TranscriptConversionError if either conversion command exits
            with a non-zero status
    """
    # command that converts pdf to ps (postscript is written to stdout)
    pdf2ps = ['pdf2ps', '-sOutputFile=%stdout', unofficial_transcript]
    # command that converts ps to ascii ('-' makes it read from stdin)
    ps2ascii = ['ps2ascii', '-']

    to_ps = Popen(pdf2ps, stdout=PIPE, stderr=PIPE)
    ps_out, ps_err = to_ps.communicate()
    if to_ps.returncode != 0:
        err_msg = 'Error converting pdf to postscript\n'\
                  'Ensure the file name was typed correctly'
        raise TranscriptConversionError(msg=err_msg, cmd=pdf2ps, err=ps_err)

    to_ascii = Popen(ps2ascii, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    ascii_out, ascii_err = to_ascii.communicate(input=ps_out)
    if to_ascii.returncode != 0:
        err_msg = 'Error converting postscript file to ascii'
        raise TranscriptConversionError(
            msg=err_msg, cmd=ps2ascii, err=ascii_err)

    # On Python 3 the pipe yields bytes, which the str regexes below refuse
    # to search (TypeError). Decode to text first; on Python 2 the output is
    # already ``str`` and is left untouched. Non-ASCII bytes are replaced
    # rather than raising: they cannot be part of a course entry anyway.
    if not isinstance(ascii_out, str):
        ascii_out = ascii_out.decode('ascii', 'replace')

    # First pass keeps only text that looks like a complete course entry,
    # second pass splits each entry into its fields.
    entries = ENTRY_REGEX.findall(ascii_out)
    fields = re.findall(EXTRACTION_REGEX, '\n'.join(entries))
    return [(prefix.lower() + number, int(possible), int(earned))
            for prefix, number, possible, earned in fields]
def main():
    """
    Extracts course information from unofficial transcript (from galaxy)
    Takes transcript as a command line argument

    Prints the extracted course identifiers to stdout as one comma-separated
    line. Exits with status 1 after printing to stderr if the argument count
    is wrong or the transcript could not be converted.
    """
    if len(sys.argv) != 2:
        # Report the name this script was actually invoked as instead of a
        # hard-coded file name that does not match this file.
        print("Usage: python %s /path/to/transcript" % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)
    try:
        course_data = extract_courses(sys.argv[1])
    except TranscriptConversionError as err:
        print(err.msg, file=sys.stderr)
        print(err.cmd, file=sys.stderr)
        print(err.err, file=sys.stderr)
        sys.exit(1)
    # Only the course identifier (first tuple element) is printed.
    print(','.join(entry[0] for entry in course_data))


if __name__ == '__main__':
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment