-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_reserve_csv.py
128 lines (86 loc) · 3.23 KB
/
parse_reserve_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Parse csv files with course reserves and clean up entries
import sys
import csv
import re
import requests
import feedparser
import pprint
import xml.etree.ElementTree as ET
import os
from operator import attrgetter
rootDir = sys.argv[1]
worldcatapikey = sys.argv[2]
startrow = 0
endrow = 0
started = False
worldcat_url = "http://www.worldcat.org/webservices/catalog/search/opensearch"
worldcat_read_search = "http://www.worldcat.org/webservices/catalog/content/"
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
# Only check pdfs
if os.path.splitext(fname)[1] != ".csv":
continue
with open(os.path.join(dirName, fname)) as f:
for i, line in enumerate(f):
if not started and "title" in line.lower():
startrow = i
if endrow==0 and "new books" in line.lower():
endrow = i
# Start CSV output
with open('reserve-readings.csv', 'a+') as csvfile:
readingWriter = csv.writer(csvfile)
# Open file, stripping off everything before title row
with open(os.path.join(dirName, fname), 'rb') as csvfile:
reserveReader = csv.reader(csvfile)
for index, row in enumerate(reserveReader):
# Skip extraneous data
if index <= startrow:
continue
# Skip on other known invalid data
# Check for the position of McCabe -- this tells us the last column
lastcol = 0
for j, item in enumerate(row):
if "mccabe" in item.lower():
lastcol = j
# Skip if can't find McCabe text
if lastcol == 0:
continue
title=""
location = ""
authors = []
if lastcol == 2:
title = row[0]
if lastcol == 3:
title = "%s %s" % (row[0], row[1])
# Remove newlines
title = re.sub(r"\n", " ", title)
print title
# Look up title in WorldCat
payload = {'format': 'atom', 'cformat': 'mla', 'count':'1', 'wskey':worldcatapikey,'q':title}
r = requests.get(worldcat_url, params=payload)
d = feedparser.parse(r.text)
#print r.url
pp = pprint.PrettyPrinter(depth=6)
pp.pprint(d)
subjects=[]
if 'entries' in d and len(d.entries)>0:
id = re.sub(r"http:\/\/worldcat\.org\/oclc\/","",d.entries[0].id)
authors = d.entries[0]['author']
title = d.entries[0].title.encode('utf-8')
singlelookup = requests.get(worldcat_read_search + id, params={'wskey':worldcatapikey, 'recordSchema': 'info:srw/schema/1/dc'})
#print singlelookup.url
#print singlelookup.text
root = ET.fromstring(singlelookup.text.encode('utf-8'))
subjects = map(attrgetter('text'), root.findall('{http://purl.org/dc/elements/1.1/}subject'))
if subjects:
for s in subjects:
print "\t"+s
# Write CSV output
print os.path.splitext(os.path.basename(fname))[0]
[coursenumber, semester] = (os.path.splitext(os.path.basename(fname))[0]).split("-")
# Course, semester, title, subjects
readingWriter.writerow([coursenumber.upper(), semester.upper(), title, authors.encode('utf-8'), id,"|".join(subjects).encode('utf-8')])
else:
print "No entries found"
print "==================="