1+ import tabula
2+ import pandas as pd
3+ import re
4+ import subprocess
5+ import os
6+
def extract(file):
    """Summarise a transcript PDF as a list of pass/fail indicators.

    Pages 1 and 2 of ``file`` are read with tabula (fixed extraction area,
    no header row) and only the first column of the result is inspected.

    Args:
        file: Path to the transcript PDF.

    Returns:
        list: ``[name, file, gpa, pr, fn, wp, ip, lower_C]`` where

        * ``name`` is the text of row 4 with spaces and dots removed,
          title-cased;
        * ``file`` is the base name of the input path;
        * ``gpa`` is ``"Fail"`` when the fourth ``|``-separated field of the
          TRANSCRIPT line is below 3;
        * ``pr``, ``fn``, ``wp``, ``ip`` are ``"Fail"`` when any extracted
          grade contains ``PR``, ``FN``, ``WP`` or ``IP`` respectively;
        * ``lower_C`` is ``"Fail"`` when any grade contains ``C-`` or is
          exactly ``D`` or ``F``.

        Every indicator that is not ``"Fail"`` is a check-mark string.

    Raises:
        IndexError: If the table has too few rows, row 3 holds no digits, or
            the TRANSCRIPT line has fewer than four ``|``-separated fields.
        ValueError: If the hours/GPA fields are not numeric.
    """
    check = u'\u2714 '

    # NOTE(review): `nospreadsheet` and a single-DataFrame return value match
    # older tabula-py releases; confirm against the installed version.
    table = tabula.read_pdf(
        file,
        pages="1,2",
        area=(9.405, 41.085, 600.435, 770.715),
        nospreadsheet=True,
        guess=False,
        pandas_options={'header': None},
    )

    pid = table.iloc[3, 0]
    name = table.iloc[4, 0].replace(" ", "").replace(".", "").lower().title()

    # `pid` (and `earned` below) are not part of the returned list; they are
    # still parsed so that a malformed transcript raises exactly as before.
    pid = re.findall(r'\d+', pid)[0]
    pid = "P{}".format(pid)

    first_col = table.iloc[:, 0]

    # The TRANSCRIPT line carries hours attempted and GPA as |-separated
    # fields (the second alternative tolerates a stray space after the "T").
    summary = first_col[first_col.str.contains('TRANSCRIPT|T RANSCRIPT', na=False)]
    fields = summary.to_string().split('|')
    earned = float(fields[2])
    gpa = "Fail" if float(fields[3]) < 3 else check

    # Keep only lines that mention a term code (Fa/Sp/Su followed by digits).
    term_lines = first_col[
        first_col.str.contains(r'Fa\d+\s+|Sp\d+\s+|Su\d+\s+', na=False)
    ]

    # Lines that begin with a term code: take everything up to the first "|".
    leading = term_lines[
        term_lines.str.contains(r'^Fa\d+\s+|^Sp\d+\s+|^Su\d+\s+', na=False)
    ]
    leading = leading.str.extract(r'([^\|]+)')

    # Lines where the course information may sit in the middle of the line:
    # take the text from the term code onwards (first capture group only).
    embedded = term_lines.str.extract(
        r'(Fa\d+\s+\w*(.*)|Sp\d+\s+\w*(.*)|Su\d+\s+\w*(.*))'
    )
    embedded = embedded.iloc[:, 0:1]

    total = pd.concat([leading, embedded], ignore_index=True)
    total.rename(columns={total.columns[0]: "course"}, inplace=True)
    total = total.drop_duplicates()
    total['course'] = total['course'].str.split('|').str[0]
    total['course'] = total['course'].str.lstrip()

    # Fixed-width slicing of the course text: 4 characters of term code,
    # 9 characters of class identifier, then hours and grade.
    total['term'] = total['course'].str[0:4]
    total['class'] = total['course'].str[4:13]
    total['course'] = total['course'].str[13:]
    total['hours'] = total['course'].str.extract(r"(\d*\.?\d+)", expand=True)
    # Strip digits/dots so the grade becomes the leading token of the rest.
    total['course'] = total['course'].str.replace(r"(\d*\.?)", "", regex=True)
    total['course'] = total['course'].str.lstrip()
    total['grade'] = total['course'].str[0:2]
    total['acad_year'] = '20' + total['term'].str[2:]
    total.drop(columns=['course'], inplace=True)
    total.drop_duplicates(inplace=True)
    total.sort_values(by=['acad_year', 'term'], inplace=True)

    grades = total['grade']

    def _flag(pattern):
        # "Fail" as soon as any extracted grade matches `pattern`.
        return "Fail" if grades.str.contains(pattern, regex=True).any() else check

    fn = _flag('FN')
    pr = _flag('PR')
    wp = _flag('WP')
    ip = _flag('IP')
    lower_C = _flag(r'C\-|^D$|^F$')

    return [name, os.path.basename(file), gpa, pr, fn, wp, ip, lower_C]
def launcher(file):
    """Open ``file`` in Adobe Acrobat Reader DC.

    Args:
        file: Path of the PDF to display.

    Returns:
        subprocess.Popen: Handle of the started reader process; its standard
        output is connected to a pipe.
    """
    # Raw string so the backslashes of the Windows path stay literal instead
    # of being parsed as (invalid) escape sequences.
    command = r"C:\Program Files (x86)\Adobe\Acrobat Reader DC\Reader\AcroRd32.exe"
    return subprocess.Popen([command, file], stdout=subprocess.PIPE)
69+
if __name__ == "__main__":
    # Both extract() and launcher() require the PDF path, so take it from the
    # command line rather than calling them without arguments.
    import sys

    if len(sys.argv) != 2:
        sys.exit("usage: {} TRANSCRIPT.pdf".format(os.path.basename(sys.argv[0])))

    extract(sys.argv[1])
    launcher(sys.argv[1])
0 commit comments