diff options
Diffstat (limited to 'src/fetch/scripts')
-rw-r--r-- | src/fetch/scripts/Makefile.am | 30 | ||||
-rw-r--r-- | src/fetch/scripts/boardgamegeek.rb | 235 | ||||
-rw-r--r-- | src/fetch/scripts/boardgamegeek.rb.spec | 7 | ||||
-rw-r--r-- | src/fetch/scripts/dark_horse_comics.py | 399 | ||||
-rw-r--r-- | src/fetch/scripts/dark_horse_comics.py.spec | 7 | ||||
-rwxr-xr-x | src/fetch/scripts/fr.allocine.py | 335 | ||||
-rw-r--r-- | src/fetch/scripts/fr.allocine.py.spec | 7 | ||||
-rw-r--r-- | src/fetch/scripts/ministerio_de_cultura.py | 595 | ||||
-rw-r--r-- | src/fetch/scripts/ministerio_de_cultura.py.spec | 7 |
9 files changed, 1622 insertions, 0 deletions
diff --git a/src/fetch/scripts/Makefile.am b/src/fetch/scripts/Makefile.am new file mode 100644 index 0000000..050c460 --- /dev/null +++ b/src/fetch/scripts/Makefile.am @@ -0,0 +1,30 @@ +####### kdevelop will overwrite this part!!! (begin)########## + +EXTRA_DIST = \ +fr.allocine.py fr.allocine.py.spec \ +ministerio_de_cultura.py ministerio_de_cultura.py.spec \ +dark_horse_comics.py dark_horse_comics.py.spec \ +boardgamegeek.rb boardgamegeek.rb.spec + +####### kdevelop will overwrite this part!!! (end)############ + +scriptdir = $(kde_datadir)/tellico/data-sources +script_SCRIPTS = \ +fr.allocine.py \ +ministerio_de_cultura.py \ +dark_horse_comics.py \ +boardgamegeek.rb + +script_DATA = \ +fr.allocine.py.spec \ +ministerio_de_cultura.py.spec \ +dark_horse_comics.py.spec \ +boardgamegeek.rb.spec + +KDE_OPTIONS = noautodist + +CLEANFILES = *~ + +# probably a better way to do this +uninstall-hook: + -if [ -d $(scriptdir) ]; then rmdir $(scriptdir); fi diff --git a/src/fetch/scripts/boardgamegeek.rb b/src/fetch/scripts/boardgamegeek.rb new file mode 100644 index 0000000..b3cf4f3 --- /dev/null +++ b/src/fetch/scripts/boardgamegeek.rb @@ -0,0 +1,235 @@ +#!/usr/bin/env ruby +# +# *************************************************************************** +# copyright : (C) 2006 by Steve Beattie +# : (C) 2008 by Sven Werlen +# email : sbeattie@suse.de +# : sven@boisdechet.org +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# $Id: boardgamegeek.rb 313 2006-10-02 22:17:11Z steve $ + +# This program is expected to be invoked from tellico +# (http://periapsis.org/tellico) 
# as an external data source. It provides
# searches for boardgames from the boardgamegeek.com website, via
# boardgamegeek's xmlapi interface
# (http://www.boardgamegeek.com/xmlapi/)
#
# It only allows searches via name; the boardgamegeek xmlapi is not yet
# rich enough to support queries by designer, publisher, category, or
# mechanism. I'd like to add support for querying by boardgamegeek id,
# but that needs additional support in tellico.
#
# Sven Werlen: 03 Feb 2008: script has been extended to retrieve cover
# images (/thumbnail from xmlapi). Images are retrieved from the website
# and base64 is generated on-the-fly.
#
require 'rexml/document'
require 'net/http'
require 'cgi'
require "base64"
include REXML

$my_version = '$Rev: 313 $'

# One board game as returned by the BoardGameGeek xmlapi, able to render
# itself as a Tellico <entry> element and as a Tellico <image> element.
class Game
  attr_writer :year, :description, :cover, :image

  # name:: game title (String, may be nil)
  # id::   BoardGameGeek game id as found in the "gameid" attribute
  def initialize(name, id)
    @name = name
    @id = id
    @publishers = []
    @designers = []
    @players = []
  end

  # Records one more publisher name for this game.
  def add_publisher(publisher)
    @publishers << publisher
  end

  # Records one more designer name for this game.
  def add_designer(designer)
    @designers << designer
  end

  # Records one supported player count (one call per count).
  def add_players(players)
    @players << players
  end

  # Short, human readable summary used for debugging output.
  def to_s()
    # The original string started with a bare "@name", which printed the
    # literal text instead of the game name.
    "#{@name} (#{@id} #{@publishers.join(', ')} #{@year})"
  end

  # Builds the Tellico <entry> element for this game.
  # Optional fields are emitted only when they have a value; the list
  # fields (publishers, designers, num-players) only when non-empty.
  # Returns a REXML::Element.
  def toXML()
    entry = REXML::Element.new('entry')
    append_child(entry, 'title', @name)
    append_child(entry, 'description', @description) if @description
    append_child(entry, 'year', @year) if @year
    if @id
      # Host name was missing its ".com" suffix in the original link.
      append_child(entry, 'boardgamegeek-link', "http://www.boardgamegeek.com/game/#{@id}")
      append_child(entry, 'bggid', @id)
    end
    append_child(entry, 'cover', @cover) if @cover

    append_group(entry, 'publishers', 'publisher', @publishers)
    append_group(entry, 'designers', 'designer', @designers)
    append_group(entry, 'num-players', 'num-player', @players)
    entry
  end

  # Builds the Tellico <image> element whose id is "<game id>.jpg" and
  # whose text is the base64 payload stored through #image=, if any.
  # Returns a REXML::Element.
  def image()
    node = REXML::Element.new('image')
    node.add_attribute('format', 'JPEG')
    # Interpolation keeps this from raising when the id is nil.
    node.add_attribute('id', "#{@id}.jpg")
    node.add_text(@image) if @image
    node
  end

  private

  # Appends <tag>text</tag> to +parent+; a nil text yields an empty element.
  def append_child(parent, tag, text)
    child = REXML::Element.new(tag)
    child.add_text(text.to_s) unless text.nil?
    parent.add_element(child)
  end

  # Appends <group_tag> holding one <item_tag> per value; nothing is
  # added when +values+ is empty.
  def append_group(parent, group_tag, item_tag, values)
    return if values.empty?

    group = REXML::Element.new(group_tag)
    values.each { |value| append_child(group, item_tag, value) }
    parent.add_element(group)
  end
end
# HTTP headers sent with every BoardGameGeek request.
def bggRequestHeaders
  { "User-Agent" => "BoardGameGeek plugin for Tellico #{$my_version}" }
end

# Text of the first child element named +tag+, or nil when it is absent.
def bggChildText(node, tag)
  child = node.elements[tag]
  child ? child.text : nil
end

# Non-negative integer held by child +tag+, or nil when absent/not numeric.
def bggChildInt(node, tag)
  text = bggChildText(node, tag).to_s.strip
  text =~ /\A\d+\z/ ? text.to_i : nil
end

# Path part of a thumbnail URL hosted on files.boardgamegeek.com, or nil
# when +url+ is nil or points anywhere else.
def bggCoverPath(url)
  match = %r{files\.boardgamegeek\.com(/.*)\z}.match(url.to_s.strip)
  match ? match[1] : nil
end

# Downloads the thumbnail at +url+ and stores it on +game+ as a base64
# image named "<game_id>.jpg". Does nothing when the URL is not a
# files.boardgamegeek.com URL or the download is not answered with 200.
def attachBggCover(game, game_id, url)
  path = bggCoverPath(url)
  return unless path

  response = Net::HTTP.start('files.boardgamegeek.com', 80) do |http|
    http.get(path, bggRequestHeaders)
  end
  return unless response.is_a?(Net::HTTPOK)

  game.cover = "#{game_id}.jpg"
  game.image = Base64.encode64(response.body)
end

# Turns an xmlapi "game" response document into Game objects.
# <game> nodes without a <name> child are skipped; every other field is
# optional. When a block is given it is called with (game, thumbnail_url)
# for each game, which is where cover retrieval is plugged in.
# Returns an Array of Game.
def parseGameDetails(xml)
  games = []
  REXML::XPath.match(REXML::Document.new(xml), '//game').each do |node|
    name = node.elements['name']
    next if name.nil?

    game = Game.new(name.text, node.attributes['gameid'])
    game.year = bggChildText(node, 'yearpublished')
    game.description = bggChildText(node, 'description')

    node.elements.each('publisher') do |pub|
      label = bggChildText(pub, 'name')
      game.add_publisher(label) if label
    end
    node.elements.each('designer') do |des|
      label = bggChildText(des, 'name')
      game.add_designer(label) if label
    end

    # One entry per supported player count; skipped when either bound is
    # missing instead of raising as Integer(nil) did.
    low = bggChildInt(node, 'minplayers')
    high = bggChildInt(node, 'maxplayers')
    low.upto(high) { |count| game.add_players(count) } if low && high

    yield game, bggChildText(node, 'thumbnail') if block_given?
    games << game
  end
  games
end

# Searches BoardGameGeek by name.
# query:: free-text game name
# Returns an Array of game id Strings (empty when the search request is
# not answered with 200 OK). Network errors propagate to the caller.
def getGameList(query)
  response = Net::HTTP.start('www.boardgamegeek.com', 80) do |http|
    http.get("/xmlapi/search?search=#{CGI.escape(query)}", bggRequestHeaders)
  end
  # Do not feed an error page to the XML parser.
  return [] unless response.is_a?(Net::HTTPOK)

  doc = REXML::Document.new(response.body)
  REXML::XPath.match(doc, '//game').map { |g| g.attributes['gameid'] }.compact
end

# Fetches full details (and cover thumbnails) for the given game ids.
# ids:: Array of game id Strings, as returned by getGameList
# Returns an Array of Game; empty when +ids+ is nil/empty or the request
# is not answered with 200 OK. Network errors propagate to the caller.
def getGameDetails(ids)
  # Nothing to look up: avoid a pointless request to "/xmlapi/game/".
  return [] if ids.nil? || ids.empty?

  response = Net::HTTP.start('www.boardgamegeek.com', 80) do |http|
    http.get("/xmlapi/game/#{ids.join(',')}", bggRequestHeaders)
  end
  return [] unless response.is_a?(Net::HTTPOK)

  parseGameDetails(response.body) do |game, thumbnail_url|
    attachBggCover(game, game.instance_variable_get(:@id), thumbnail_url)
  end
end
# Writes a complete Tellico (syntax version 10) board game collection for
# +gameList+ to standard output.
#
# gameList:: Array of objects responding to #toXML (the <entry> element)
#            and #image (the <image> element); nil is treated as empty.
#
# Entries receive sequential ids starting at 0. The collection declares
# the default fields plus the custom "bggid" field, and all images are
# gathered in a trailing <images> element.
def listToXML(gameList)
  doc = REXML::Document.new
  doc << REXML::DocType.new('tellico PUBLIC', '"-//Robby Stephenson/DTD Tellico V10.0//EN" "http://periapsis.org/tellico/dtd/v10/tellico.dtd"')
  doc << REXML::XMLDecl.new

  tellico = REXML::Element.new('tellico')
  tellico.add_attribute('xmlns', 'http://periapsis.org/tellico/')
  tellico.add_attribute('syntaxVersion', '10')

  collection = REXML::Element.new('collection')
  collection.add_attribute('title', 'My Collection')
  collection.add_attribute('type', '13')

  fields = REXML::Element.new('fields')
  default_field = REXML::Element.new('field')
  default_field.add_attribute('name', '_default')
  fields.add_element(default_field)

  # Custom field carrying the BoardGameGeek id of every entry.
  bgg_field = REXML::Element.new('field')
  [
    ['name', 'bggid'],
    ['title', 'BoardGameGeek ID'],
    ['category', 'General'],
    ['flags', '0'],
    ['format', '4'],
    ['type', '6'],
    ['i18n', 'true']
  ].each { |key, value| bgg_field.add_attribute(key, value) }
  fields.add_element(bgg_field)
  collection.add_element(fields)

  images = REXML::Element.new('images')

  # A nil list (no search result) still yields a valid, empty collection.
  (gameList || []).each_with_index do |game, index|
    entry = game.toXML
    entry.add_attribute('id', index.to_s)
    collection.add_element(entry)
    images.add_element(game.image)
  end
  collection.add_element(images)

  tellico.add_element(collection)
  doc.add_element(tellico)
  doc.write($stdout, 0)
  puts ""
end
# Command line entry point: boardgamegeek.rb <game_query>
# Searches BoardGameGeek for the query and prints the matching games as a
# Tellico collection on standard output.
if __FILE__ == $0

  # Prints the usage line on standard error and exits with status 1.
  def showUsage
    warn "usage: #{__FILE__} game_query"
    exit 1
  end

  showUsage unless ARGV.length == 1

  idList = getGameList(ARGV.shift)
  # Always hand listToXML a list, even when the search produced nothing.
  gameList = idList ? getGameDetails(idList) : []

  listToXML(gameList)
end
+ +Tellico data source setup: +- source name: Dark Horse Comics (US) (or whatever you want :) +- Collection type: comics collection +- Result type: tellico +- Path: /path/to/script/comics_darkhorsecomics.py +- Arguments: +Title (checked) = %1 +Update (checked) = %{title} +""" + +import sys, os, re, md5, random, string +import urllib, urllib2, time, base64 +import xml.dom.minidom + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" +NULLSTRING = '' + +VERSION = "0.2" + + +def genMD5(): + """ + Generates and returns a random md5 string. Its main purpose is to allow random + image file name generation. + """ + obj = md5.new() + float = random.random() + obj.update(str(float)) + return obj.hexdigest() + +class BasicTellicoDOM: + """ + This class manages tellico's XML data model (DOM) + """ + def __init__(self): + self.__doc = xml.dom.minidom.Document() + self.__root = self.__doc.createElement('tellico') + self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') + self.__root.setAttribute('syntaxVersion', '9') + + self.__collection = self.__doc.createElement('collection') + self.__collection.setAttribute('title', 'My Comics') + self.__collection.setAttribute('type', '6') + + self.__images = self.__doc.createElement('images') + + self.__root.appendChild(self.__collection) + self.__doc.appendChild(self.__root) + + # Current movie id. See entry's id attribute in self.addEntry() + self.__currentId = 0 + + + def addEntry(self, movieData): + """ + Add a comic entry. 
+ Returns an entry node instance + """ + d = movieData + entryNode = self.__doc.createElement('entry') + entryNode.setAttribute('id', str(self.__currentId)) + + titleNode = self.__doc.createElement('title') + titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) + + yearNode = self.__doc.createElement('pub_year') + yearNode.appendChild(self.__doc.createTextNode(d['pub_year'])) + + countryNode = self.__doc.createElement('country') + countryNode.appendChild(self.__doc.createTextNode(d['country'])) + pubNode = self.__doc.createElement('publisher') + pubNode.appendChild(self.__doc.createTextNode(d['publisher'])) + langNode = self.__doc.createElement('language') + langNode.appendChild(self.__doc.createTextNode(d['language'])) + + writersNode = self.__doc.createElement('writers') + for g in d['writer']: + writerNode = self.__doc.createElement('writer') + writerNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + writersNode.appendChild(writerNode) + + genresNode = self.__doc.createElement('genres') + for g in d['genre']: + genreNode = self.__doc.createElement('genre') + genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + genresNode.appendChild(genreNode) + + commentsNode = self.__doc.createElement('comments') + #for g in d['comments']: + # commentsNode.appendChild(self.__doc.createTextNode(unicode("%s\n\n" % g, 'latin-1').encode('utf-8'))) + commentsData = string.join(d['comments'], '\n\n') + commentsNode.appendChild(self.__doc.createTextNode(unicode(commentsData, 'latin-1').encode('utf-8'))) + + artistsNode = self.__doc.createElement('artists') + for k, v in d['artist'].iteritems(): + artistNode = self.__doc.createElement('artist') + artistNode.appendChild(self.__doc.createTextNode(unicode(v, 'latin-1').encode('utf-8'))) + artistsNode.appendChild(artistNode) + + pagesNode = self.__doc.createElement('pages') + 
pagesNode.appendChild(self.__doc.createTextNode(d['pages'])) + + issueNode = self.__doc.createElement('issue') + issueNode.appendChild(self.__doc.createTextNode(d['issue'])) + + if d['image']: + imageNode = self.__doc.createElement('image') + imageNode.setAttribute('format', 'JPEG') + imageNode.setAttribute('id', d['image'][0]) + imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) + + coverNode = self.__doc.createElement('cover') + coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) + + for name in ( 'writersNode', 'genresNode', 'artistsNode', 'pagesNode', 'yearNode', + 'titleNode', 'issueNode', 'commentsNode', 'pubNode', 'langNode', + 'countryNode' ): + entryNode.appendChild(eval(name)) + + if d['image']: + entryNode.appendChild(coverNode) + self.__images.appendChild(imageNode) + + self.__collection.appendChild(entryNode) + + self.__currentId += 1 + return entryNode + + def printEntry(self, nEntry): + """ + Prints entry's XML content to stdout + """ + try: + print nEntry.toxml() + except: + print sys.stderr, "Error while outputing XML content from entry to Tellico" + + def printXMLTree(self): + """ + Outputs XML content to stdout + """ + self.__collection.appendChild(self.__images) + print XML_HEADER; print DOCTYPE + print self.__root.toxml() + + +class DarkHorseParser: + def __init__(self): + self.__baseURL = 'http://www.darkhorse.com' + self.__basePath = '/profile/profile.php?sku=' + self.__searchURL = '/search/search.php?frompage=userinput&sstring=%s&x=0&y=0' + self.__coverPath = 'http://images.darkhorse.com/covers/' + self.__movieURL = self.__baseURL + self.__basePath + + # Define some regexps + self.__regExps = { 'title' : '<font size="\+2"><b>(?P<title>.*?)</b></font>', + 'pub_date' : '<b>Pub.* Date:</b> *<a.*>(?P<pub_date>.*)</a>', + 'desc' : '<p>(?P<desc>.*?)<br>', + 'writer' : '<b>Writer: *</b> *<a.*?>(?P<writer>.*)</a>', + 'cover_artist' : '<b>Cover Artist: *</b> *<a.*>(?P<cover_artist>.*)</a>', 
+ 'penciller' : '<b>Penciller: *</b> *<a.*>(?P<penciller>.*)</a>', + 'inker' : '<b>Inker: *</b> *<a.*>(?P<inker>.*)</a>', + 'letterer' : '<b>Letterer: *</b> *<a.*>(?P<letterer>.*)</a>', + 'colorist' : '<b>Colorist: *</b> *<a.*>(?P<colorist>.*)</a>', + 'genre' : '<b>Genre: *</b> *<a.*?>(?P<genre>.*?)</a><br>', + 'format' : '<b>Format: *</b> *(?P<format>.*?)<br>', + } + + # Compile patterns objects + self.__regExpsPO = {} + for k, pattern in self.__regExps.iteritems(): + self.__regExpsPO[k] = re.compile(pattern) + + self.__domTree = BasicTellicoDOM() + + def run(self, title): + """ + Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree + to stdout (in tellico format) so that tellico can use it. + """ + self.__getMovie(title) + # Print results to stdout + self.__domTree.printXMLTree() + + def __getHTMLContent(self, url): + """ + Fetch HTML data from url + """ + u = urllib2.urlopen(url) + self.__data = u.read() + u.close() + + def __fetchMovieLinks(self): + """ + Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent() + that need to be parsed. + """ + matchList = re.findall("""<a *href="%s(?P<page>.*?)">(?P<title>.*?)</a>""" % self.__basePath.replace('?', '\?'), self.__data) + if not matchList: return None + + return matchList + + def __fetchCover(self, path, delete = True): + """ + Fetch cover to /tmp. Returns base64 encoding of data. 
+ The image is deleted if delete is True + """ + md5 = genMD5() + imObj = urllib2.urlopen(path.strip()) + img = imObj.read() + imObj.close() + imgPath = "/tmp/%s.jpeg" % md5 + try: + f = open(imgPath, 'w') + f.write(img) + f.close() + except: + print sys.stderr, "Error: could not write image into /tmp" + + b64data = (md5 + '.jpeg', base64.encodestring(img)) + + # Delete temporary image + if delete: + try: + os.remove(imgPath) + except: + print sys.stderr, "Error: could not delete temporary image /tmp/%s.jpeg" % md5 + + return b64data + + def __fetchMovieInfo(self, url): + """ + Looks for movie information + """ + self.__getHTMLContent(url) + + # First grab picture data + imgMatch = re.search("""<img src="%s(?P<imgpath>.*?)".*>""" % self.__coverPath, self.__data) + if imgMatch: + imgPath = self.__coverPath + imgMatch.group('imgpath') + # Fetch cover and gets its base64 encoded data + b64img = self.__fetchCover(imgPath) + else: + b64img = None + + # Now isolate data between <div class="bodytext">...</div> elements + # re.S sets DOTALL; it makes the "." 
special character match any character at all, including a newline + m = re.search("""<div class="bodytext">(?P<part>.*)</div>""", self.__data, re.S) + self.__data = m.group('part') + + matches = {} + data = {} + data['comments'] = [] + data['artist'] = {} + + # Default values + data['publisher'] = 'Dark Horse Comics' + data['language'] = 'English' + data['country'] = 'USA' + + data['image'] = b64img + data['pub_year'] = NULLSTRING + + for name, po in self.__regExpsPO.iteritems(): + data[name] = NULLSTRING + if name == 'desc': + matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I) + else: + matches[name] = po.search(self.__data) + + if matches[name]: + if name == 'title': + title = matches[name].group('title').strip() + data[name] = title + # Look for issue information + m = re.search("#(?P<issue>[0-9]+)", title) + if m: + data['issue'] = m.group('issue') + else: + data['issue'] = '' + + elif name == 'pub_date': + pub_date = matches[name].group('pub_date').strip() + data['pub_year'] = pub_date[-4:] + # Add this to comments field + data['comments'].insert(0, "Pub. 
Date: %s" % pub_date) + + elif name == 'desc': + # Find biggest size + max = 0 + for i in range(len(matches[name])): + if len(matches[name][i]) > len(matches[name][max]): + max = i + data['comments'].append(matches[name][max].strip()) + + elif name == 'writer': + # We may find several writers + data[name] = [] + writersList = re.sub('</?a.*?>', '', matches[name].group('writer')).split(',') + for d in writersList: + data[name].append(d.strip()) + + elif name == 'cover_artist': + data['artist']['Cover Artist'] = matches[name].group('cover_artist').strip() + + elif name == 'penciller': + data['artist']['Penciller'] = matches[name].group('penciller').strip() + + elif name == 'inker': + data['artist']['Inker'] = matches[name].group('inker').strip() + + elif name == 'colorist': + data['artist']['Colorist'] = matches[name].group('colorist').strip() + + elif name == 'letterer': + data['artist']['Letterer'] = matches[name].group('letterer').strip() + + elif name == 'genre': + # We may find several genres + data[name] = [] + genresList = re.sub('</?a.*?>', '', matches[name].group('genre')).split(',') + for d in genresList: + data[name].append(d.strip()) + + elif name == 'format': + format = matches[name].group('format').strip() + data['comments'].insert(1, format) + m = re.search("(?P<pages>[0-9]+)", format) + if m: + data['pages'] = m.group('pages') + else: + data['pages'] = '' + + return data + + + def __getMovie(self, title): + if not len(title): return + + self.__title = title + self.__getHTMLContent("%s%s" % (self.__baseURL, self.__searchURL % urllib.quote(self.__title))) + + # Get all links + links = self.__fetchMovieLinks() + + # Now retrieve infos + if links: + for entry in links: + data = self.__fetchMovieInfo( url = self.__movieURL + entry[0] ) + # Add DC link (custom field) + data['darkhorse'] = "%s%s" % (self.__movieURL, entry[0]) + node = self.__domTree.addEntry(data) + # Print entries on-the-fly + #self.__domTree.printEntry(node) + else: + return None + +def 
halt(): + print "HALT." + sys.exit(0) + +def showUsage(): + print "Usage: %s comic" % sys.argv[0] + sys.exit(1) + +def main(): + if len(sys.argv) < 2: + showUsage() + + parser = DarkHorseParser() + parser.run(sys.argv[1]) + +if __name__ == '__main__': + main() diff --git a/src/fetch/scripts/dark_horse_comics.py.spec b/src/fetch/scripts/dark_horse_comics.py.spec new file mode 100644 index 0000000..9481dc8 --- /dev/null +++ b/src/fetch/scripts/dark_horse_comics.py.spec @@ -0,0 +1,7 @@ +Name=Dark Horse Comics +Type=data-source +ArgumentKeys=1 +Arguments=%1 +CollectionType=6 +FormatType=0 +UpdateArgs=%{title} diff --git a/src/fetch/scripts/fr.allocine.py b/src/fetch/scripts/fr.allocine.py new file mode 100755 index 0000000..97a2247 --- /dev/null +++ b/src/fetch/scripts/fr.allocine.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# *************************************************************************** +# copyright : (C) 2006 by Mathias Monnerville +# email : tellico@monnerville.com +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# Version 0.4: 2007-08-27 +# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres +# could not be retrieved. Fixed bad http request error due to some changes in HTML code. +# +# Version 0.3: +# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed. +# +# Version 0.2: +# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore. 
+# +# Version 0.1: +# * Initial release. + +import sys, os, re, md5, random +import urllib, urllib2, time, base64 +import xml.dom.minidom + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" + +VERSION = "0.4" + +def genMD5(): + obj = md5.new() + float = random.random() + obj.update(str(float)) + return obj.hexdigest() + +class BasicTellicoDOM: + def __init__(self): + self.__doc = xml.dom.minidom.Document() + self.__root = self.__doc.createElement('tellico') + self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') + self.__root.setAttribute('syntaxVersion', '9') + + self.__collection = self.__doc.createElement('collection') + self.__collection.setAttribute('title', 'My Movies') + self.__collection.setAttribute('type', '3') + + self.__fields = self.__doc.createElement('fields') + # Add all default (standard) fields + self.__dfltField = self.__doc.createElement('field') + self.__dfltField.setAttribute('name', '_default') + + # Add a custom 'Collection' field + self.__customField = self.__doc.createElement('field') + self.__customField.setAttribute('name', 'titre-original') + self.__customField.setAttribute('title', 'Original Title') + self.__customField.setAttribute('flags', '8') + self.__customField.setAttribute('category', 'General') + self.__customField.setAttribute('format', '1') + self.__customField.setAttribute('type', '1') + self.__customField.setAttribute('i18n', 'yes') + + self.__fields.appendChild(self.__dfltField) + self.__fields.appendChild(self.__customField) + self.__collection.appendChild(self.__fields) + + self.__images = self.__doc.createElement('images') + + self.__root.appendChild(self.__collection) + self.__doc.appendChild(self.__root) + + # Current movie id + self.__currentId = 0 + + + def addEntry(self, movieData): + """ + Add a movie entry + """ + d = movieData + entryNode = 
self.__doc.createElement('entry') + entryNode.setAttribute('id', str(self.__currentId)) + + titleNode = self.__doc.createElement('title') + titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) + + otitleNode = self.__doc.createElement('titre-original') + otitleNode.appendChild(self.__doc.createTextNode(unicode(d['otitle'], 'latin-1').encode('utf-8'))) + + yearNode = self.__doc.createElement('year') + yearNode.appendChild(self.__doc.createTextNode(unicode(d['year'], 'latin-1').encode('utf-8'))) + + genresNode = self.__doc.createElement('genres') + for g in d['genres']: + genreNode = self.__doc.createElement('genre') + genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + genresNode.appendChild(genreNode) + + natsNode = self.__doc.createElement('nationalitys') + natNode = self.__doc.createElement('nat') + natNode.appendChild(self.__doc.createTextNode(unicode(d['nat'], 'latin-1').encode('utf-8'))) + natsNode.appendChild(natNode) + + castsNode = self.__doc.createElement('casts') + for g in d['actors']: + castNode = self.__doc.createElement('cast') + col1Node = self.__doc.createElement('column') + col2Node = self.__doc.createElement('column') + col1Node.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + castNode.appendChild(col1Node) + castNode.appendChild(col2Node) + castsNode.appendChild(castNode) + + dirsNode = self.__doc.createElement('directors') + for g in d['dirs']: + dirNode = self.__doc.createElement('director') + dirNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + dirsNode.appendChild(dirNode) + + timeNode = self.__doc.createElement('running-time') + timeNode.appendChild(self.__doc.createTextNode(unicode(d['time'], 'latin-1').encode('utf-8'))) + + allocineNode = self.__doc.createElement(unicode('allociné-link', 'latin-1').encode('utf-8')) + allocineNode.appendChild(self.__doc.createTextNode(unicode(d['allocine'], 
'latin-1').encode('utf-8'))) + + plotNode = self.__doc.createElement('plot') + plotNode.appendChild(self.__doc.createTextNode(unicode(d['plot'], 'latin-1').encode('utf-8'))) + + if d['image']: + imageNode = self.__doc.createElement('image') + imageNode.setAttribute('format', 'JPEG') + imageNode.setAttribute('id', d['image'][0]) + imageNode.setAttribute('width', '120') + imageNode.setAttribute('height', '160') + imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) + + coverNode = self.__doc.createElement('cover') + coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) + + for name in ( 'titleNode', 'otitleNode', 'yearNode', 'genresNode', 'natsNode', + 'castsNode', 'dirsNode', 'timeNode', 'allocineNode', 'plotNode' ): + entryNode.appendChild(eval(name)) + + if d['image']: + entryNode.appendChild(coverNode) + self.__images.appendChild(imageNode) + + self.__collection.appendChild(entryNode) + + self.__currentId += 1 + + def printXML(self): + """ + Outputs XML content to stdout + """ + self.__collection.appendChild(self.__images) + print XML_HEADER; print DOCTYPE + print self.__root.toxml() + + +class AlloCineParser: + def __init__(self): + self.__baseURL = 'http://www.allocine.fr' + self.__basePath = '/film/fichefilm_gen_cfilm' + self.__searchURL= 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1' + self.__movieURL = self.__baseURL + self.__basePath + + # Define some regexps + self.__regExps = { 'title' : '<title>(?P<title>.+?)</title>', + 'dirs' : 'Réalisé par <a.*?>(?P<step1>.+?)</a>.*?</h4>', + 'actors' : '<h4>Avec *<a.*?>(?P<step1>.+)</a> ', + 'nat' : '<h4>Film *(?P<nat>.+?)[,\.]', + 'genres' : '<h4>Genre *: *<a.*?>(?P<step1>.+?)</a></h4>', + 'time' : '<h4>Durée *: *(?P<hours>[0-9])?h *(?P<mins>[0-9]{1,2})min', + 'year' : 'Année de production *: *(?P<year>[0-9]{4})', + # Original movie title + 'otitle' : 'Titre original *: *<i>(?P<otitle>.+?)</i>', + 'plot' : """(?s)<td valign="top" style="padding:10 0 0 
0"><div align="justify"><h4> *(?P<plot>.+?) *</h4>""", + 'image' : """<td valign="top" width="120".*?<img src="(?P<image>.+?)" border"""} + + + self.__domTree = BasicTellicoDOM() + + def run(self, title): + """ + Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree + to stdout (in tellico format) so that tellico can use it. + """ + self.__getMovie(title) + # Print results to stdout + self.__domTree.printXML() + + def __getHTMLContent(self, url): + """ + Fetch HTML data from url + """ + + u = urllib2.urlopen(url) + self.__data = u.read() + u.close() + + def __fetchMovieLinks(self): + """ + Retrieve all links related to movie + """ + matchList = re.findall("""<h4><a *href="%s=(?P<page>.*?\.html?)" *class="link1">(?P<title>.*?)</a>""" % self.__basePath, self.__data) + if not matchList: return None + + return matchList + + def __fetchMovieInfo(self, url): + """ + Looks for movie information + """ + self.__getHTMLContent(url) + + matches = data = {} + + for name, regexp in self.__regExps.iteritems(): + if name == 'image': + matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I) + else: + matches[name] = re.search(regexp, self.__data) + + if matches[name]: + if name == 'title': + data[name] = matches[name].group('title').strip() + elif name == 'dirs': + dirsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in dirsList: + data[name].append(d.strip()) + + elif name == 'actors': + actorsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in actorsList: + data[name].append(d.strip()) + + elif name == 'nat': + data[name] = matches[name].group('nat').strip() + + elif name == 'genres': + genresList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in genresList: + data[name].append(d.strip()) + + elif name == 'time': + h, m = matches[name].group('hours'), matches[name].group('mins') + 
totmin = int(h)*60+int(m) + data[name] = str(totmin) + + elif name == 'year': + data[name] = matches[name].group('year').strip() + + elif name == 'otitle': + data[name] = matches[name].group('otitle').strip() + + elif name == 'plot': + data[name] = matches[name].group('plot').strip() + + # Image path + elif name == 'image': + # Save image to a temporary folder + md5 = genMD5() + imObj = urllib2.urlopen(matches[name][0].strip()) + img = imObj.read() + imObj.close() + imgPath = "/tmp/%s.jpeg" % md5 + try: + f = open(imgPath, 'w') + f.write(img) + f.close() + except: + # Could be great if we can pass exit code and some message + # to tellico in case of failure... + pass + + data[name] = (md5 + '.jpeg', base64.encodestring(img)) + # Delete temporary image + try: + os.remove(imgPath) + except: + # Could be great if we can pass exit code and some msg + # to tellico in case of failure... + pass + else: + matches[name] = '' + + return data + + + def __getMovie(self, title): + if not len(title): return + + self.__title = title + self.__getHTMLContent(self.__searchURL % urllib.quote(self.__title)) + + # Get all links + links = self.__fetchMovieLinks() + + # Now retrieve infos + if links: + for entry in links: + data = self.__fetchMovieInfo( url = "%s=%s" % (self.__movieURL, entry[0]) ) + # Add allocine link (custom field) + data['allocine'] = "%s=%s" % (self.__movieURL, entry[0]) + self.__domTree.addEntry(data) + else: + return None + + + +def showUsage(): + print "Usage: %s movietitle" % sys.argv[0] + sys.exit(1) + +def main(): + if len(sys.argv) < 2: + showUsage() + + parser = AlloCineParser() + parser.run(sys.argv[1]) + +if __name__ == '__main__': + main() diff --git a/src/fetch/scripts/fr.allocine.py.spec b/src/fetch/scripts/fr.allocine.py.spec new file mode 100644 index 0000000..773b951 --- /dev/null +++ b/src/fetch/scripts/fr.allocine.py.spec @@ -0,0 +1,7 @@ +Name=Allocine.fr +Type=data-source +ArgumentKeys=1 +Arguments=%1 +CollectionType=3 +FormatType=0 
+UpdateArgs=%{title} diff --git a/src/fetch/scripts/ministerio_de_cultura.py b/src/fetch/scripts/ministerio_de_cultura.py new file mode 100644 index 0000000..8a768f9 --- /dev/null +++ b/src/fetch/scripts/ministerio_de_cultura.py @@ -0,0 +1,595 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# *************************************************************************** +# copyright : (C) 2006-2008 by Mathias Monnerville +# email : tellico@monnerville.com +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# $Id: books_ministerio_de_cultura.py 428 2007-03-07 13:17:17Z mathias $ + +""" +This script has to be used with tellico (http://periapsis.org/tellico) as an external data source program. +It allows searching for books in Spanish Ministry of Culture's database (at http://www.mcu.es/bases/spa/isbn/ISBN.html). + +Multiple ISBN/UPC searching is supported through the -m option: + ./books_ministerio_de_cultura.py -m filename +where filename holds one ISBN or UPC per line. + +Tellico data source setup: +- Source type: External Application +- Source name: Ministerio de Cultura (ES) (or whatever you want :) +- Collection type: Book Collection +- Result type: Tellico +- Path: /path/to/script/books_ministerio_de_cultura.py +- Arguments: +Title (checked) = -t %1 +Person (checked) = -a %1 +ISBN (checked) = -i %1 +UPC (checked) = -i %1 +Update (checked) = %{title} + +** Please note that this script is also part of the Tellico's distribution. 
+** You will always find the latest version in the SVN trunk of Tellico + +SVN Version: + * Removes translators for Authors List + * Adds translators to translator field + * Change from "Collection" to "Series" + * Process "Series Number" + * Adds in comments "ed.lit." authors + * If there isn't connection to Spanish Ministry of Culture + shows a nice error message (timeout: 5 seconds) + * Removed "translated from/to" from Comments field as already + exists in "Publishing" field + * Removed "Collection" field as I moved to Series/Series Number + +Version 0.3.2: + * Now find 'notas' field related information + * search URL modified to fetch information of exhausted books too + +Version 0.3.1: +Bug Fixes: + * The 'tr.' string does not appear among authors anymore + * Fixed an AttributeError exception related to a regexp matching the number of pages + +Version 0.3: +Bug Fixes: + * URL of the search engine has changed: + http://www.mcu.es/bases/spa/isbn/ISBN.html is now http://www.mcu.es/comun/bases/isbn/ISBN.html + * All the regexps have been rewritten to match the new site's content + +Version 0.2: +New features: + * Support for multiple ISBN/UPC searching (support from command line with -m option) + * Default books collection enhanced with a new custom field 'Collection' + * Search extended for both available and exhausted books + * Hyphens are stripped out in the ISBN (or UPC) search + +Bug Fixes: + * Publication year now holds only the year + * ISBN regexp fix + * Fix for publisher field (values were inverted) + * -i parameter works for both ISBN and UPC based search + +Version 0.1: + * Initial Release +""" + +import sys, os, re, md5, random, string +import urllib, urllib2, time, base64 +import xml.dom.minidom, types +import socket + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" +NULLSTRING = '' + +VERSION = "0.3.2" + 
# Search kinds understood by MinisterioCulturaParser. The numeric values are
# also used as positions in the (ISBN, AUTHOR, TITLE) search-URL arguments.
ISBN, AUTHOR, TITLE = range(3)

# Role markers that follow a person's name in the site's author list.
TRANSLATOR_STR = "tr."
EDLIT_STR = "ed. lit."


class EngineError(Exception):
    """Raised when a search cannot be carried out (e.g. no criteria given)."""
    pass


class BasicTellicoDOM:
    """
    This class manages tellico's XML data model (DOM)
    """
    def __init__(self):
        self.__doc = xml.dom.minidom.Document()
        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Books')
        self.__collection.setAttribute('type', '2')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')

        # Add a custom 'Collection' field (Left by reference for
        # the future)
        #self.__customCollectionField = self.__doc.createElement('field')
        #self.__customCollectionField.setAttribute('name', 'book_collection')
        #self.__customCollectionField.setAttribute('title', 'Collection')
        #self.__customCollectionField.setAttribute('flags', '7')
        #self.__customCollectionField.setAttribute('category', 'Classification')
        #self.__customCollectionField.setAttribute('format', '0')
        #self.__customCollectionField.setAttribute('type', '1')
        #self.__customCollectionField.setAttribute('i18n', 'yes')

        self.__fields.appendChild(self.__dfltField)
        #self.__fields.appendChild(self.__customCollectionField)
        self.__collection.appendChild(self.__fields)

        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Id given to the next entry. See entry's id attribute in self.addEntry()
        self.__currentId = 0

    def addEntry(self, movieData):
        """
        Add a book entry to the collection.

        movieData is a dict of field values; plain strings and lists of
        strings are treated as latin-1 and re-encoded to UTF-8. 'language' and
        'author' are iterated as lists, 'comments' is joined with '<br/>', all
        other fields are single text values. Missing fields are written as
        empty elements instead of raising KeyError.

        Returns the new entry node.
        """
        # Convert on a copy so the caller's dict is left untouched.
        d = {}
        for key, value in movieData.items():
            if type(value) == types.ListType:
                value = [unicode(item, 'latin-1').encode('utf-8') for item in value]
            elif type(value) == types.StringType:
                value = unicode(value, 'latin-1').encode('utf-8')
            d[key] = value

        doc = self.__doc

        def field(key):
            return d.get(key, NULLSTRING)

        def textNode(tag, text):
            node = doc.createElement(tag)
            node.appendChild(doc.createTextNode(text))
            return node

        def listNode(wrapperTag, itemTag, values):
            wrapper = doc.createElement(wrapperTag)
            for value in values:
                wrapper.appendChild(textNode(itemTag, value))
            return wrapper

        entryNode = doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))

        # Explicit child list instead of eval(name + 'Node'). The eval lookup
        # resolved 'keyword' to the inner <keyword> node, which detached it
        # from the <keywords> wrapper that had just been built; the wrapper is
        # now emitted, consistent with <languages> and <authors>.
        children = (
            textNode('title', field('title')),
            textNode('pub_year', field('pub_year')),
            textNode('publisher', field('publisher')),
            listNode('languages', 'language', d.get('language', [])),
            listNode('keywords', 'keyword', [field('keyword')]),
            textNode('edition', field('edition')),
            listNode('authors', 'author', d.get('author', [])),
            textNode('comments', '<br/>'.join(d.get('comments', []))),
            textNode('pages', field('pages')),
            textNode('isbn', field('isbn')),
            textNode('pur_price', field('pur_price')),
            textNode('series', field('series')),
            textNode('series_num', field('series_num')),
            textNode('translator', field('translator')),
        )
        for child in children:
            entryNode.appendChild(child)

        self.__collection.appendChild(entryNode)
        self.__currentId += 1

        return entryNode

    def printEntry(self, nEntry):
        """
        Prints entry's XML content to stdout
        """
        try:
            print(nEntry.toxml())
        except Exception:
            # stdout is what Tellico parses, so the diagnostic goes to stderr.
            sys.stderr.write("Error while outputing XML content from entry to Tellico\n")

    def printXMLTree(self):
        """
        Outputs XML content to stdout
        """
        print(XML_HEADER)
        print(DOCTYPE)
        print(self.__root.toxml())


class MinisterioCulturaParser:
    """
    Searches the Spanish Ministry of Culture ISBN database and collects the
    results in a BasicTellicoDOM.
    """
    def __init__(self):
        # Search form is at http://www.mcu.es/comun/bases/isbn/ISBN.html
        self.__baseURL = 'http://www.mcu.es'
        self.__searchURL = '/cgi-brs/BasesHTML/isbn/BRSCGI?CMD=VERLST&BASE=ISBN&DOCS=1-15&CONF=AEISPA.cnf&OPDEF=AND&SEPARADOR=' + \
                           '&WDIS-C=DISPONIBLE+or+AGOTADO&WGEN-C=&WISB-C=%s&WAUT-C=%s&WTIT-C=%s&WMAT-C=&WEDI-C=&'

        # Kept apart from __searchURL because it contains literal '%40'
        # sequences that must not go through the % formatting above.
        self.__suffixURL = 'WFEP-C=&%40T353-GE=&%40T353-LE=&WSER-C=&WLUG-C=&WLEN-C=&WCLA-C=&WSOP-C='

        # Define some regexps
        # NOTE(review): the accented letters and the euro sign below are
        # reproduced as they appear here; they may have been HTML entities
        # upstream -- confirm against the original file.
        self.__regExps = { 'author'        : '<th scope="row">Autor:.*?<td>(?P<author>.*?)</td>',
                           'isbn'          : '<span class="cabTitulo">ISBN.*?<strong>(?P<isbn>.*?)</strong>', # Matches ISBN 13
                           'title'         : '<th scope="row">Título:.*?<td>(?P<title>.*?)</td>',
                           'language'      : '<th scope="row">Lengua:.*?<td>(?P<language>.*?)</td>',
                           'edition'       : '<th scope="row">Edición:.*?<td>.*?<span>(?P<edition>.*?)</span>',
                           'pur_price'     : '<th scope="row">Precio:.*?<td>.*?<span>(?P<pur_price>.*?)€</span>',
                           'desc'          : '<th scope="row">Descripción:.*?<td>.*?<span>(?P<desc>.*?)</span>',
                           'publication'   : '<th scope="row">Publicación:.*?<td>.*?<span>(?P<publication>.*?)</span>',
                           'keyword'       : '<th scope="row">Materias:.*?<td>.*?<span>(?P<keywords>.*?)</span>',
                           'notas'         : '<th scope="row">Notas:.*?<td>.*?<span>(?P<notas>.*?)</span>',
                           'cdu'           : '<th scope="row">CDU:.*?<td><span>(?P<cdu>.*?)</span></td>',
                           'encuadernacion': '<th scope="row">Encuadernación:.*?<td>.*?<span>(?P<encuadernacion>.*?)</span>',
                           'series'        : '<th scope="row">Colección:.*?<td>.*?<span>(?P<series>.*?)</span>'
                         }

        # Compile patterns objects, with the flags the searches need, so that
        # __fetchBookInfo can use them directly.
        self.__regExpsPO = {}
        for k, pattern in self.__regExps.items():
            self.__regExpsPO[k] = re.compile(pattern, re.S | re.I)

        self.__domTree = BasicTellicoDOM()

    def run(self, criteria, kind):
        """
        Runs the parser: fetch book related links, then fills and prints the DOM tree
        to stdout (in tellico format) so that tellico can use it.

        For ISBN searches, criteria may hold several ';'-separated codes.
        Raises EngineError when no usable criteria remain.
        """
        if kind == ISBN:
            # Strip out hyphens if kind is ISBN
            criteria = criteria.replace('-', NULLSTRING)
            # Support for multiple search; blank items (e.g. from empty lines
            # in a -m file) are skipped instead of aborting the whole run.
            isbnList = [n.strip() for n in criteria.split(';')]
            isbnList = [n for n in isbnList if n]
            if not isbnList:
                raise EngineError("No data given. Unable to proceed.")
            for n in isbnList:
                self.__getBook(n, kind)
        else:
            self.__getBook(criteria, kind)

        # Print results to stdout
        self.__domTree.printXMLTree()

    def __getHTMLContent(self, url):
        """
        Fetch HTML data from url into self.__data.
        Exits the program with an error message if the URL cannot be opened.
        """
        try:
            u = urllib2.urlopen(url)
        except Exception as e:
            # Nothing was opened, so there is nothing to close here.
            sys.exit("""
Network error while getting HTML content.
Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage:
'%s'""" % e)

        try:
            self.__data = u.read()
        finally:
            u.close()

    def __fetchBookLinks(self):
        """
        Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent()
        that need to be parsed.

        Returns the list of URLs found, or None when there is none.
        """
        matchList = re.findall("""<div class="isbnResDescripcion">.*?<p>.*?<A target="_top" HREF="(?P<url>.*?)">""", self.__data, re.S)

        if not matchList:
            return None
        return matchList

    def __fetchBookInfo(self, url):
        """
        Looks for book information on the detail page at url.
        Returns a dict of field values suitable for BasicTellicoDOM.addEntry().
        """
        self.__getHTMLContent(url)

        # Fields that are derived from other rows get a default here, so the
        # entry can still be built when the page lacks those rows.
        data = {
            'comments'   : [],
            'series_num' : NULLSTRING,   # empty string if series not available
            'translator' : NULLSTRING,
            'pages'      : NULLSTRING,
            'publisher'  : NULLSTRING,
            'pub_year'   : NULLSTRING,
        }

        for name, po in self.__regExpsPO.items():
            data[name] = NULLSTRING
            match = po.search(self.__data)
            if not match:
                continue

            if name == 'title':
                d = match.group('title').strip()
                d = re.sub('<.?strong>', NULLSTRING, d)
                d = re.sub('\n', NULLSTRING, d)
                data['title'] = d

            elif name == 'isbn':
                data['isbn'] = match.group('isbn').strip()

            elif name == 'edition':
                data['edition'] = match.group('edition').strip()

            elif name == 'pur_price':
                d = match.group('pur_price')
                data['pur_price'] = d.strip() + ' EUR'

            elif name == 'publication':
                d = match.group('publication')
                # NOTE(review): the second pattern is shown here as a single
                # space, which removes every space from the publisher text;
                # it may have been an HTML entity upstream -- confirm.
                for p in ('</?[Aa].*?>', ' ', ':', ','):
                    d = re.sub(p, NULLSTRING, d)

                d = d.split('\n')
                # d[1] is an empty string
                if len(d) >= 4:
                    data['publisher'] = "%s (%s)" % (d[2], d[0])
                    data['pub_year'] = re.sub(r'\d{2}\/', NULLSTRING, d[3])
                del data['publication']

            elif name == 'desc':
                d = match.group('desc')
                m = re.search(r'\d+ ', d)
                if m:
                    data['pages'] = m.group(0).strip()
                m = re.search('; (?P<format>.*cm)', d)
                if m:
                    data['comments'].append('Format: ' + m.group('format').strip())
                del data['desc']

            elif name == 'encuadernacion':
                data['comments'].append(match.group('encuadernacion').strip())

            elif name == 'keyword':
                d = match.group('keywords')
                d = re.sub('</?[Aa].*?>', NULLSTRING, d)
                data['keyword'] = d.strip()

            elif name == 'cdu':
                data['comments'].append('CDU: ' + match.group('cdu').strip())

            elif name == 'notas':
                data['comments'].append(match.group('notas').strip())

            elif name == 'series':
                d = match.group('series').strip()
                # NOTE(review): as shown here this replaces a space with a
                # space; the pattern may have been an HTML entity upstream.
                d = re.sub(' ', ' ', d)
                data[name] = d
                # data[name] can contain something like 'Byblos, 162/24'

                # Maybe better to add the reg exp to get seriesNum in self.__regExps
                s = re.search('[0-9]+$', data[name])
                if s:
                    # if series ends with a number, it seems that is a
                    # number of the book inside the series. We save in seriesNum
                    data['series_num'] = s.group()

                    # it removes lasts digits (plus one because is space or /) from
                    # data['series']
                    l = len(data['series_num']) + 1
                    data[name] = data[name][0:-l]
                    data[name] = data[name].rstrip(",") # remove the , between series and series_num

            elif name == 'author':
                # We may find several authors
                data[name] = []
                authorsList = re.findall('<a.*?>(?P<author>.*?)</a>', match.group('author'), re.S | re.I)
                if not authorsList:
                    # No href links
                    authors = re.search('<li>(?P<author>.*?)</li>', match.group('author'), re.S | re.I)
                    if authors:
                        data[name] = [r.strip() for r in authors.group('author').strip().split(',')]
                else:
                    for d in authorsList:
                        # Sometimes, the search engine outputs some image between a elements
                        if d.strip()[:4] != '<img':
                            data[name].append(d.strip())

                # Move tr authors (translators) to translators list
                translator = self.__getSpecialRol(data[name], TRANSLATOR_STR)
                edlit = self.__getSpecialRol(data[name], EDLIT_STR)
                data[name] = self.__removeSpecialsFromAuthors(data[name], translator, TRANSLATOR_STR)
                data[name] = self.__removeSpecialsFromAuthors(data[name], edlit, EDLIT_STR)

                if len(translator) > 0:
                    data['translator'] = self.__formatSpecials(translator, NULLSTRING)

                if len(edlit) > 0:
                    data['comments'].append(self.__formatSpecials(edlit, "Editor Literario: "))

            elif name == 'language':
                # We may find several languages
                d = match.group('language')
                d = re.sub('\n', NULLSTRING, d)
                a = []
                for lg in d.split('<span>'):
                    if len(lg):
                        lg = re.sub('</span>', NULLSTRING, lg)
                        # Because HTML is not interpreted in the 'language' field of Tellico
                        # NOTE(review): the pattern may have been an HTML
                        # entity upstream -- confirm.
                        lg = re.sub('ó', 'o', lg)
                        a.append(lg.strip())
                if a:
                    # Removes that word so that only the language name remains.
                    a[0] = re.sub('publicacion: ', NULLSTRING, a[0])
                data['language'] = a
                # Add other language related info to the 'comments' field too
                #for lg in a[1:]:
                    #data['comments'].append(lg)

        return data

    def __getBook(self, data, kind = ISBN):
        """
        Search for data (an ISBN/UPC, author or title, depending on kind) and
        add one entry per result to the DOM tree.

        Raises EngineError if data is empty or kind is not a known search kind.
        """
        if not len(data):
            raise EngineError("No data given. Unable to proceed.")
        if kind not in (ISBN, AUTHOR, TITLE):
            raise EngineError("Unknown search kind: %r" % (kind,))

        # The search URL takes (ISBN, AUTHOR, TITLE) in that order, which is
        # also the numeric value of each kind; only the requested one is set.
        terms = [NULLSTRING, NULLSTRING, NULLSTRING]
        terms[kind] = urllib.quote(data)
        self.__getHTMLContent("%s%s%s" % (self.__baseURL,
                                          self.__searchURL % tuple(terms),
                                          self.__suffixURL))

        # Get all links
        links = self.__fetchBookLinks()

        # Now retrieve infos
        if links:
            for entry in links:
                info = self.__fetchBookInfo( url = self.__baseURL + entry.replace(' ', '%20') )
                self.__domTree.addEntry(info)
        else:
            return None

    def __getSpecialRol(self, authors, special):
        """
        Receives a list like ['Stephen King','Lorenzo Cortina','tr.',
        'Rosalía Vázquez','tr.'] and returns a list with special names,
        i.e. the names immediately followed by the special marker.
        """
        # Start at 1: a marker in first position has no name before it
        # (index -1 would wrongly pick the last element).
        return [authors[j - 1] for j in range(1, len(authors)) if authors[j] == special]

    def __removeSpecialsFromAuthors(self, authors, specials, string):
        """
        Receives a list with authors+translators and removes 'tr.' and
        authors from there. Example:
        authors: ['Stephen King','Lorenzo Cortina','tr.','Rosalía Vázquez','tr.']
        translators: ['Lorenzo Cortina','Rosalía Vázquez']
        returns: ['Stephen King']

        Names are dropped by position (the name right before a marker), so a
        person listed both as author and as translator keeps the author entry.
        The input list is not modified.
        """
        newauthors = []
        total = len(authors)
        for pos in range(total):
            name = authors[pos]
            if name == string:
                # The role marker itself never belongs in the authors list
                continue
            followedByMarker = pos + 1 < total and authors[pos + 1] == string
            if followedByMarker and name in specials:
                continue
            newauthors.append(name)

        return newauthors

    def __formatSpecials(self, translators, prefix):
        """
        Receives a list with translators and returns a string
        (authors are handled different: each author in a different node)
        """
        return prefix + '; '.join(translators)


def halt():
    """Print a marker and exit successfully."""
    print("HALT.")
    sys.exit(0)


def showUsage():
    """Print the command line help and exit with status 1."""
    print("""Usage: %s options
Where options are:
 -t title
 -i (ISBN|UPC)
 -a author
 -m filename (support for multiple ISBN/UPC search)""" % sys.argv[0])
    sys.exit(1)


def main():
    """
    Command line entry point: parse the option/value pair, run the search
    and print the Tellico XML to stdout.
    """
    if len(sys.argv) < 3:
        showUsage()

    socket.setdefaulttimeout(5)

    # -m is an ISBN/UPC search whose codes are read from a file
    opts = {'-t' : TITLE, '-i' : ISBN, '-a' : AUTHOR, '-m' : ISBN}
    option, criteria = sys.argv[1], sys.argv[2]
    if option not in opts:
        showUsage()

    if option == '-m':
        try:
            f = open(criteria, 'r')
            try:
                lines = f.readlines()
            finally:
                f.close()
        except IOError as e:
            sys.stderr.write("Error: %s\n" % e)
            sys.exit(1)
        # One code per line -> ;-separated ISBNs string. strip() (rather than
        # cutting the last character) keeps an unterminated final line intact;
        # blank lines are ignored.
        criteria = ';'.join([ln.strip() for ln in lines if ln.strip()])

    parser = MinisterioCulturaParser()
    try:
        parser.run(criteria, opts[option])
    except EngineError as e:
        sys.exit("Error: %s" % e)


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Companion data-source descriptor that follows this script in the patch
# (src/fetch/scripts/ministerio_de_cultura.py.spec, new file, 7 lines):
#
#   Name=Spanish Ministry of Culture
#   Type=data-source
#   ArgumentKeys=1,2,3,4
#   Arguments=-t %1,-a %1,-i %1,-i %1
#   CollectionType=2
#   FormatType=0
#   UpdateArgs=-t %{title}
# ---------------------------------------------------------------------------