-
Notifications
You must be signed in to change notification settings - Fork 0
/
mklist-imdb-c-expired-no
executable file
·189 lines (172 loc) · 6.83 KB
/
mklist-imdb-c-expired-no
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Look up information about Norwegian movies in the IMDB and try to
calculate if a movie is in the public domain in Norway based on the
time of death for the relevant people.
Allow overriding the death year from a local file, for persons where
IMDB is missing time of death.
According to Norwegian regulations, the copyright status is determined
by the year of death of the main director, the screenplay writer, the
dialogue writer and the composer of the film music. In Norway, movies
enter the public domain 70 years after the death of the longest living
of these. The challenge is figuring out who they were and when they
died.
"""
import argparse
import datetime
import json
import lxml.html
import movielib
import time
import urlparse
# Number of years added to the year of the last relevant death (or to
# the production year) to get the year the protection expires.
vernetid = 70
thisyear = datetime.date.today().year

# Person information keyed by IMDB person URL, so that each person is
# only looked up once per run.
personcache = dict()

# Availability from the Norwegian National Library, keyed by IMDB
# title URL.
nbstatus = dict()

# Information currently missing in IMDB, keyed by IMDB title URL.
extra = dict()
def loadinfo(path):
    """
    Return the decoded content of the JSON file at path.
    """
    with open(path, 'rt') as jsonfile:
        data = json.load(jsonfile)
    return data
def fetch_person_info(personurl):
    """
    Return a dictionary with information about the person at the IMDB
    URL personurl.

    A URL already present in personcache is answered from there
    without fetching anything.  Otherwise the page is fetched and the
    new entry gets the keys 'name' and 'imdb', plus either 'dead' (the
    year of death as an integer) or 'deathmissing' (True) when the
    page has no time of death.  The new entry is stored in personcache
    before it is returned.
    """
    try:
        return personcache[personurl]
    except KeyError:
        pass

    page = lxml.html.fromstring(movielib.http_get_read(personurl))
    person = {
        'name': page.cssselect("h1.header span.itemprop")[0].text_content(),
        'imdb': personurl,
    }

    death = page.cssselect("div#name-death-info time[datetime]")
    if not death:
        person['deathmissing'] = True
    else:
        # The year is the part of the datetime attribute before the
        # first "-".
        person['dead'] = int(death[0].attrib['datetime'].split("-")[0])

    personcache[personurl] = person
    return person
def fetch_title_info(titleurl):
    """
    Collect the people credited on a title from the sources available
    (currently the IMDB page found by appending 'fullcredits' to
    titleurl).

    Returns a dictionary mapping a role ('director', 'photo',
    'editing', 'creator' or 'music') to a list with the result of
    fetch_person_info() for every person linked from the matching
    credits section.  Roles without any linked person are left out.
    """
    roles = (
        ("Directed by", "director"),
        ("Cinematography by", "photo"),
        ("Film Editing by", "editing"),
        ("Writing Credits", "creator"),
        ("Music by", "music"),
    )
    creditsurl = titleurl + 'fullcredits'
    creditspage = lxml.html.fromstring(movielib.http_get_read(creditsurl))

    people = {}
    for heading in creditspage.cssselect("h4"):
        for label, role in roles:
            if label not in heading.text_content():
                continue
            # The links to the credited people are taken from the
            # element that follows the section heading.
            for link in heading.getnext().cssselect("a[href]"):
                href = link.attrib['href']
                # Drop the query string to get a stable person URL.
                personurl = urlparse.urljoin(creditsurl, href).split("?")[0]
                person = fetch_person_info(personurl)
                people.setdefault(role, []).append(person)
    return people
def _parse_listing_year(text):
    """
    Return the production year found in the text of a search result
    year field, for example "(1949)" or "(I) (1949)".

    Only the first four digits are used, so a range of years gives the
    first year.  Raises ValueError if the text has fewer than four
    digits.
    """
    # Filtering on digits works for both byte and unicode strings and
    # ignores parentheses, roman numerals, dashes and spaces alike.
    digits = ''.join(ch for ch in text if ch.isdigit())
    if len(digits) < 4:
        raise ValueError("no production year found in %r" % (text,))
    return int(digits[:4])


def _add_extra_people(titleinfo, additions):
    """
    Merge the locally recorded additions for one title into titleinfo.

    Only list values in additions are used.  Every entry in such a list
    is looked up with fetch_person_info() using its 'imdb' URL, and is
    added to the list for that key in titleinfo unless a person with
    the same 'imdb' URL is already listed there.
    """
    for role, people in additions.items():
        if not isinstance(people, list):
            continue
        for extraperson in people:
            info = fetch_person_info(extraperson['imdb'])
            # Create the role only when there is a person to add, so an
            # empty list of additions leaves titleinfo untouched.
            credited = titleinfo.setdefault(role, [])
            if not any(info['imdb'] == p['imdb'] for p in credited):
                credited.append(info)


def fetch_search_result(entries, urlbase, page):
    """
    Fetch one page of IMDB search results and record every title on it.

    urlbase is a format string taking the page number.  For every title
    on the page the credits are fetched with fetch_title_info(), merged
    with the entry for the title in extra (if any), and the expiry year
    is set to vernetid years after the latest known death among the
    directors, creators and composers, or after the production year
    when no death is known.  The result is stored in entries under the
    IMDB URL of the title and a summary line is printed.

    Returns True if the page links to a following page of results,
    otherwise False.  Raises ValueError if a title has no production
    year.
    """
    url = urlbase % page
    root = lxml.html.fromstring(movielib.http_get_read(url))
    for h in root.cssselect("div.lister-item-content h3.lister-item-header "):
        t = h.cssselect("a[href]")
        if not t:
            continue
        ta = t[0].cssselect("a")[0]
        imdburl = urlparse.urljoin(url, ta.attrib['href'].split('?')[0])
        title = t[0].text_content().strip()

        y = h.cssselect("span.lister-item-year")
        year = _parse_listing_year(y[0].text_content() if y else '')

        titleinfo = fetch_title_info(imdburl)
        titleinfo['title'] = title
        titleinfo['year'] = year
        if imdburl in extra:
            _add_extra_people(titleinfo, extra[imdburl])

        # Note, the photographer is not considered here
        deaths = [p['dead']
                  for c in ('director', 'creator', 'music')
                  for p in titleinfo.get(c, [])
                  if 'dead' in p]
        lastdeath = max([0] + deaths)
        if 0 == lastdeath:
            # No known death, fall back to the production year.
            lastdeath = year
        titleinfo['yearexpire'] = lastdeath + vernetid
        if 0 != lastdeath and titleinfo['yearexpire'] <= thisyear:
            titleinfo['status'] = 'candidate'
        else:
            titleinfo['status'] = 'unknown'
        titleinfo['vernetid'] = titleinfo['yearexpire'] - year

        # Marker printed in front of titles the library reports as
        # available.
        inlibrary = " "
        if imdburl in nbstatus:
            for k in ('inlibrary', 'fromnb'):
                titleinfo[k] = nbstatus[imdburl][k]
            if 'yes' == titleinfo['inlibrary']:
                inlibrary = "*"

        entries[imdburl] = titleinfo
        print(u"%s%10s %s-%s (%3d år) %s - %s" % (
            inlibrary, titleinfo['status'],
            year, titleinfo['yearexpire'], titleinfo['vernetid'],
            title, imdburl))

    return bool(root.cssselect("a.lister-page-next"))
def main():
    """
    Parse the command line, load the local JSON files into the module
    level dictionaries and walk the IMDB search results for Norwegian
    titles one production year at a time, saving the collected entries
    to the output file.
    """
    global personcache, nbstatus, extra

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output', type=str,
        default='free-movies-imdb-c-expired-no.json',
        help='output file [free-movies-imdb-c-expired-no.json]')
    parser.add_argument(
        '--all', action='store_true', default=False,
        help='include also non-feature films (default No)')
    args = parser.parse_args()

    personcache = loadinfo("no-author-deaths.json")
    nbstatus = loadinfo("no-movies-at-nb.json")
    extra = loadinfo("no-imdb-extra-info.json")

    # Unless --all is given, restrict the search to feature films.
    typefilter = '' if args.all else '&title_type=feature'

    entries = {}
    # The oldest Norwegian movie in IMDB is from 1907.  It is not a
    # feature film.
    firstyear = 1907
    stopyear = thisyear - vernetid
    for year in xrange(firstyear, stopyear):
        # The page number is filled in by fetch_search_result().
        urlbase = ("http://www.imdb.com/search/title?country_of_origin=no&year=%d&page=%%s" % year) + typefilter
        page = 1
        while fetch_search_result(entries, urlbase, page):
            page += 1
        movielib.savelist(entries, name=args.output)
        time.sleep(2)  # avoid overloading IMDB
# Run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()