mklist-imdb-c-expired-no

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Look up information about Norwegian movies in the IMDB and try to
calculate if a movie is in the public domain in Norway based on the
time of death for the relevant people.

Allow overriding the death year from a local file, for persons where
IMDB is missing time of death.

Under Norwegian regulations the copyright status depends on the death
year of the main director, the script writer, the dialogue writer and
the composer of the film music.  In Norway, movies enter the public
domain 70 years after the death of the longest living of these.  The
challenge is figuring out who they were and when they died.
"""

import argparse
import datetime
import json
import lxml.html
import movielib
import time
import urlparse

vernetid = 70 # år
thisyear = datetime.datetime.now().year

# Make sure we only look up a person once per run
personcache = {}

# Is the movie available from the Norwegian National Library?
nbstatus = {}

# Information currently missing in IMDB
extra = {}

def loadinfo(path):
    """Read the JSON document stored in the file at path and return it decoded."""
    with open(path, 'rt') as jsonfile:
        content = jsonfile.read()
    return json.loads(content)

def fetch_person_info(personurl):
    """
    Return a dictionary describing the person found at personurl.

    The dictionary always has the keys 'name' and 'imdb' (the URL
    itself), and in addition either 'dead' (the year of death as an
    integer) or 'deathmissing' (True) when the page has no death time
    element.  Results are stored in personcache, and a URL already
    present there is returned without any download.
    """
    try:
        return personcache[personurl]
    except KeyError:
        pass  # not seen before, fetch it below

    page = lxml.html.fromstring(movielib.http_get_read(personurl))
    heading = page.cssselect("h1.header span.itemprop")[0]
    person = {
        'name': heading.text_content(),
        'imdb': personurl,
    }
    deathtimes = page.cssselect("div#name-death-info time[datetime]")
    if not deathtimes:
        person['deathmissing'] = True
    else:
        # The year is the part of the datetime attribute before the
        # first "-".
        stamp = deathtimes[0].attrib['datetime']
        person['dead'] = int(stamp.split("-")[0])
    personcache[personurl] = person
    return person

def fetch_title_info(titleurl):
    """
    Collect the people credited for a title (currently from
    IMDB <title-URL>/fullcredits).

    Every h4 heading on the credits page whose text contains one of the
    known section names is matched to a role, and each person linked
    from the element following that heading is looked up with
    fetch_person_info().  Returns a dictionary mapping role name
    ('director', 'photo', 'editing', 'creator', 'music') to a list of
    person dictionaries; roles without any linked person are left out.
    """
    sections = (
        ("Directed by", "director"),
        ("Cinematography by", "photo"),
        ("Film Editing by", "editing"),
        ("Writing Credits", "creator"),
        ("Music by", "music"),
    )
    creditsurl = titleurl + 'fullcredits'
    page = lxml.html.fromstring(movielib.http_get_read(creditsurl))
    people = {}
    for heading in page.cssselect("h4"):
        headingtext = heading.text_content()
        for marker, role in sections:
            if marker not in headingtext:
                continue
            for link in heading.getnext().cssselect("a[href]"):
                target = urlparse.urljoin(creditsurl, link.attrib['href'])
                # Drop the query string so the URL is usable as cache key.
                person = fetch_person_info(target.split("?")[0])
                people.setdefault(role, []).append(person)
    return people

def _parse_year(text):
    """Return the first four-digit number in text as an int, or None."""
    import re  # imported here because the file does not import re at the top
    found = re.search(r'\d{4}', text)
    if found is None:
        return None
    return int(found.group(0))


def _add_extra_people(titleinfo, additions):
    """Add locally recorded people to titleinfo unless already listed."""
    for role, people in additions.items():
        # Only list values describe people; other values are ignored.
        if not isinstance(people, list):
            continue
        for extraperson in people:
            info = fetch_person_info(extraperson['imdb'])
            listed = titleinfo.setdefault(role, [])
            if not any(info['imdb'] == p['imdb'] for p in listed):
                listed.append(info)


def fetch_search_result(entries, urlbase, page):
    """
    Process one page of an IMDB title search and record every title.

    urlbase is a format string with one placeholder for the page
    number.  For each title on the page the credited people are
    fetched with fetch_title_info(), people from the global 'extra'
    table are merged in, and the expiry year is calculated as the
    latest death year among director, creator and music plus vernetid.
    When no death year is known the release year is used instead.
    The result is stored in entries[<title URL>] and a summary line is
    printed.

    NOTE(review): because of the release year fallback, a title
    without any known death year becomes a 'candidate' as soon as
    release year + vernetid has passed -- confirm this is intended.

    Returns True if the page links to a following page, else False.
    Raises ValueError if no release year can be found for a title.
    """
    url = urlbase % page
    #print url
    html = movielib.http_get_read(url)
    root = lxml.html.fromstring(html)
    for h in root.cssselect("div.lister-item-content h3.lister-item-header "):
        links = h.cssselect("a[href]")
        if not links:
            continue
        link = links[0]
        imdburl = urlparse.urljoin(url, link.attrib['href'].split('?')[0])
        title = link.text_content().strip()

        # Pick the four digit year out of the year text instead of
        # stripping a fixed set of characters, so unicode text and any
        # text around the year do not break the parsing.
        yearspans = h.cssselect("span.lister-item-year")
        yeartext = yearspans[0].text_content() if yearspans else ''
        year = _parse_year(yeartext)
        if year is None:
            raise ValueError("no release year found for %s (year text %r)"
                             % (imdburl, yeartext))

        titleinfo = fetch_title_info(imdburl)
        titleinfo['title'] = title
        titleinfo['year'] = year

        if imdburl in extra:
            _add_extra_people(titleinfo, extra[imdburl])

        # Note, the photographer is not considered here
        deathyears = [p['dead']
                      for role in ('director', 'creator', 'music')
                      for p in titleinfo.get(role, [])
                      if 'dead' in p]
        lastdeath = max([0] + deathyears)
        if 0 == lastdeath:
            # No death year known, fall back to the release year.
            lastdeath = year
        titleinfo['yearexpire'] = lastdeath + vernetid
        if 0 != lastdeath and titleinfo['yearexpire'] <= thisyear:
            titleinfo['status'] = 'candidate'
        else:
            titleinfo['status'] = 'unknown'
        # Effective protection time counted from the release year.
        titleinfo['vernetid'] = titleinfo['yearexpire'] - year

        inlibrary = " "
        if imdburl in nbstatus:
            for k in ('inlibrary', 'fromnb'):
                titleinfo[k] = nbstatus[imdburl][k]
            if 'yes' == titleinfo['inlibrary']:
                inlibrary = "*"
        entries[imdburl] = titleinfo
        print(u"%s%10s %s-%s (%3d år) %s - %s" % (inlibrary, titleinfo['status'],
                                      year, titleinfo['yearexpire'], titleinfo['vernetid'],
                                      title, imdburl))
        #print titleinfo
    return bool(root.cssselect("a.lister-page-next"))

def main():
    """
    Search IMDB year by year for Norwegian titles and save the result.

    The years from 1907 up to, but not including, thisyear - vernetid
    are searched, every result page of a year is handed to
    fetch_search_result(), and the collected entries are written with
    movielib.savelist() after each year.

    NOTE(review): the end year is exclusive, so titles released exactly
    vernetid years ago are never searched -- confirm this is intended.
    """
    global personcache, nbstatus, extra

    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=str,
                        default='free-movies-imdb-c-expired-no.json',
                        help='output file [free-movies-imdb-c-expired-no.json]')
    parser.add_argument('--all', action='store_true', default=False,
                        help='include also non-feature films (default No)')
    args = parser.parse_args()

    personcache = loadinfo("no-author-deaths.json")
    nbstatus = loadinfo("no-movies-at-nb.json")
    extra = loadinfo("no-imdb-extra-info.json")

    # Unless --all is given the search is limited to feature films.
    typefilter = '' if args.all else '&title_type=feature'

    entries = {}
    # Oldest norwegian movie in IMDB is from 1907.  It is not a feature film
    firstyear = 1907
    stopyear = thisyear - vernetid
    for searchyear in xrange(firstyear, stopyear):
        # The page number placeholder (%s) is kept for fetch_search_result().
        urlbase = ("http://www.imdb.com/search/title?country_of_origin=no&year=%d&page=%%s"
                   % searchyear) + typefilter
        page = 1
        while fetch_search_result(entries, urlbase, page):
            page += 1
        # Save after every year so an interrupted run keeps its results.
        movielib.savelist(entries, name=args.output)
        time.sleep(2) # avoid overloading IMDB

# Only start the search when the file is run as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()