Skip to content

Commit 6769579

Browse files
committed
Initial commit
0 parents  commit 6769579

20 files changed

+815
-0
lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

.gitignore

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
pip-wheel-metadata/
24+
share/python-wheels/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
MANIFEST
29+
30+
# PyInstaller
31+
# Usually these files are written by a python script from a template
32+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33+
*.manifest
34+
*.spec
35+
36+
# Installer logs
37+
pip-log.txt
38+
pip-delete-this-directory.txt
39+
40+
# Unit test / coverage reports
41+
htmlcov/
42+
.tox/
43+
.nox/
44+
.coverage
45+
.coverage.*
46+
.cache
47+
nosetests.xml
48+
coverage.xml
49+
*.cover
50+
*.py,cover
51+
.hypothesis/
52+
.pytest_cache/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
target/
76+
77+
# Jupyter Notebook
78+
.ipynb_checkpoints
79+
80+
# IPython
81+
profile_default/
82+
ipython_config.py
83+
84+
# pyenv
85+
.python-version
86+
87+
# pipenv
88+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
90+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
91+
# install all needed dependencies.
92+
#Pipfile.lock
93+
94+
# celery beat schedule file
95+
celerybeat-schedule
96+
97+
# SageMath parsed files
98+
*.sage.py
99+
100+
# Environments
101+
.env
102+
.venv
103+
env/
104+
venv/
105+
ENV/
106+
env.bak/
107+
venv.bak/
108+
109+
# Spyder project settings
110+
.spyderproject
111+
.spyproject
112+
113+
# Rope project settings
114+
.ropeproject
115+
116+
# mkdocs documentation
117+
/site
118+
119+
# mypy
120+
.mypy_cache/
121+
.dmypy.json
122+
dmypy.json
123+
124+
# Pyre type checker
125+
.pyre/

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# DARS
2+

extract.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import tabula
2+
import pandas as pd
3+
import re
4+
import subprocess
5+
import os
6+
7+
def extract(file):
    """Parse one PDF report and return its pass/fail checks as a flat row.

    Pages 1 and 2 of ``file`` are read with ``tabula.read_pdf`` using a fixed
    page ``area`` and no header row; only the first column of the resulting
    frame is used.  Lines containing a term token (``Fa``/``Sp``/``Su``
    followed by digits) are split into term, class, hours and grade, and the
    grade column is then scanned for a handful of problem grades.

    Args:
        file: Path of the PDF to parse.

    Returns:
        list: ``[name, file_basename, gpa, pr, fn, wp, ip, lower_C]``.
        ``name`` is the text at row 4 with spaces and periods removed and
        title-cased; ``file_basename`` is ``os.path.basename(file)``; every
        other entry is either ``"Fail"`` or a check mark (U+2714).

    Raises:
        IndexError, ValueError: when the expected rows or ``|``-separated
            fields are missing or are not numeric.
    """
    # NOTE(review): `nospreadsheet` is reported as deprecated in newer
    # tabula-py releases in favour of `stream`, and the code below assumes
    # read_pdf returns a single DataFrame -- confirm against the installed
    # tabula-py version before upgrading.
    df = tabula.read_pdf(
        file,
        pages="1,2",
        area=(9.405, 41.085, 600.435, 770.715),
        nospreadsheet=True,
        guess=False,
        pandas_options={'header': None},
    )
    pid = df.iloc[3, 0]
    name = df.iloc[4, 0].replace(" ", "").replace(".", "").lower().title()

    # The id is normalised to "P<digits>".  It is not part of the returned
    # row, but parsing it still rejects a page without an id on row 3.
    pid = re.findall(r'\d+', pid)[0]
    pid = "P{}".format(pid)

    # The TRANSCRIPT summary line is '|'-separated: field 2 is read as the
    # hours figure and field 3 as the GPA (presumably earned hours -- the
    # original comment says "hours attempted"; verify against a sample PDF).
    first_col = df.iloc[:, 0]
    summary = first_col[
        first_col.str.contains('TRANSCRIPT|T RANSCRIPT', na=False)
    ].to_string()
    vals = summary.split('|')

    earned = float(vals[2])  # parsed (and validated) but not returned
    gpa = "Fail" if float(vals[3]) < 3 else u'\u2714'

    # Keep only lines that mention a term token anywhere.
    term_lines = first_col[
        first_col.str.contains(r'Fa\d+\s+|Sp\d+\s+|Su\d+\s+', na=False)
    ]

    # Lines that begin with Fa/Sp/Su: take everything up to the first '|'.
    df1 = term_lines[
        term_lines.str.contains(r'^Fa\d+\s+|^Sp\d+\s+|^Su\d+\s+', na=False)
    ]
    df1 = df1.str.extract(r'([^\|]+)')

    # Lines where the course information sits in the middle of the line:
    # capture from the term token to the end of the line.  This runs over
    # every term line, so lines already captured above show up again; the
    # repeats are removed by the drop_duplicates calls below.
    df2 = term_lines.str.extract(
        r'(Fa\d+\s+\w*(.*)|Sp\d+\s+\w*(.*)|Su\d+\s+\w*(.*))'
    )
    df2 = df2.iloc[:, 0:1]

    total = pd.concat([df1, df2], ignore_index=True)
    total.rename(columns={total.columns[0]: "course"}, inplace=True)
    total = total.drop_duplicates()
    total['course'] = total['course'].str.split('|').str[0]
    total['course'] = total['course'].str.lstrip()

    # Fixed-width slicing of "<term><class><rest>"; `rest` is then reduced
    # to hours (first number) and grade (first two characters after the
    # digits and dots have been stripped).
    total['term'] = total['course'].str[0:4]
    total['class'] = total['course'].str[4:13]
    total['course'] = total['course'].str[13:]
    total['hours'] = total['course'].str.extract(r"(\d*\.?\d+)", expand=True)
    total['course'] = total['course'].str.replace(r"(\d*\.?)", "", regex=True)
    total['course'] = total['course'].str.lstrip()
    total['grade'] = total['course'].str[0:2]
    total['acad_year'] = '20' + total['term'].str[2:]
    total.drop(columns=['course'], inplace=True)
    total.drop_duplicates(inplace=True)
    total.sort_values(by=['acad_year', 'term'], inplace=True)

    def grade_check(pattern):
        """Return "Fail" if any grade matches ``pattern``, else a check mark."""
        if total.grade.str.contains(pattern, regex=True).any():
            return "Fail"
        return u'\u2714'

    fn = grade_check('FN')
    pr = grade_check('PR')
    wp = grade_check('WP')
    ip = grade_check('IP')
    lower_C = grade_check(r'C\-|^D$|^F$')

    return [name, os.path.basename(file), gpa, pr, fn, wp, ip, lower_C]
65+
# pdf launcher
66+
def launcher(
    file,
    command=r"C:\Program Files (x86)\Adobe\Acrobat Reader DC\Reader\AcroRd32.exe",
):
    """Open ``file`` with an external viewer and return the child process.

    Args:
        file: Path handed to the viewer as its only argument.
        command: Path of the viewer executable.  Defaults to the Acrobat
            Reader DC location this script was written against; pass another
            executable to use a different viewer or install location.

    Returns:
        subprocess.Popen: Handle of the started process.  Its stdout is
        connected to a pipe, so callers that care about the output should
        read it (e.g. via ``communicate()``).

    Raises:
        OSError: if ``command`` cannot be executed.
    """
    # The default is a raw string: "\P", "\A" and "\R" are not valid escape
    # sequences and trigger warnings in a normal string literal.
    return subprocess.Popen([command, file], stdout=subprocess.PIPE)
69+
70+
if __name__ == "__main__":
    import sys

    # extract() and launcher() both require the PDF path, so read it from
    # the command line instead of calling them without arguments (which
    # raises TypeError).
    if len(sys.argv) != 2:
        sys.exit("usage: python extract.py <file.pdf>")
    extract(sys.argv[1])
    launcher(sys.argv[1])

0 commit comments

Comments
 (0)