diff options
Diffstat (limited to 'src/fetch/scripts/ministerio_de_cultura.py')
-rw-r--r-- | src/fetch/scripts/ministerio_de_cultura.py | 92 |
1 files changed, 46 insertions, 46 deletions
diff --git a/src/fetch/scripts/ministerio_de_cultura.py b/src/fetch/scripts/ministerio_de_cultura.py index 8a768f9..19ea7be 100644 --- a/src/fetch/scripts/ministerio_de_cultura.py +++ b/src/fetch/scripts/ministerio_de_cultura.py @@ -37,10 +37,10 @@ ISBN (checked) = -i %1 UPC (checked) = -i %1 Update (checked) = %{title} -** Please note that this script is also part of the Tellico's distribution. +** Please note that this script is also part of the Tellico's distribution. ** You will always find the latest version in the SVN trunk of Tellico -SVN Version: +SVN Version: * Removes translators for Authors List * Adds translators to translator field * Change from "Collection" to "Series" @@ -85,7 +85,7 @@ Version 0.1: """ import sys, os, re, md5, random, string -import urllib, urllib2, time, base64 +import urllib.request, urllib.parse, urllib.error, time, base64 import xml.dom.minidom, types import socket @@ -95,7 +95,7 @@ NULLSTRING = '' VERSION = "0.3.2" -ISBN, AUTHOR, TITLE = range(3) +ISBN, AUTHOR, TITLE = list(range(3)) TRANSLATOR_STR = "tr." EDLIT_STR = "ed. lit." @@ -111,16 +111,16 @@ class BasicTellicoDOM: self.__root = self.__doc.createElement('tellico') self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') self.__root.setAttribute('syntaxVersion', '9') - + self.__collection = self.__doc.createElement('collection') self.__collection.setAttribute('title', 'My Books') self.__collection.setAttribute('type', '2') - self.__fields = self.__doc.createElement('fields') + self.__fields = self.__doc.createElement('fields') # Add all default (standard) fields - self.__dfltField = self.__doc.createElement('field') - self.__dfltField.setAttribute('name', '_default') - + self.__dfltField = self.__doc.createElement('field') + self.__dfltField.setAttribute('name', '_default') + # Add a custom 'Collection' field (Left by reference for # the future) #self.__customCollectionField = self.__doc.createElement('field') @@ -146,18 +146,18 @@ class BasicTellicoDOM: def addEntry(self, movieData): """ - Add a comic entry. + Add a comic entry. Returns an entry node instance """ d = movieData # Convert all strings to UTF-8 - for i in d.keys(): - if type(d[i]) == types.ListType: - d[i] = [unicode(d[i][j], 'latin-1').encode('utf-8') for j in range(len(d[i]))] - elif type(d[i]) == types.StringType: - d[i] = unicode(d[i], 'latin-1').encode('utf-8') + for i in list(d.keys()): + if type(d[i]) == list: + d[i] = [str(d[i][j], 'latin-1').encode('utf-8') for j in range(len(d[i]))] + elif type(d[i]) == bytes: + d[i] = str(d[i], 'latin-1').encode('utf-8') entryNode = self.__doc.createElement('entry') entryNode.setAttribute('id', str(self.__currentId)) @@ -213,7 +213,7 @@ class BasicTellicoDOM: translatorNode = self.__doc.createElement('translator') translatorNode.appendChild(self.__doc.createTextNode(d['translator'])) - for name in ( 'title', 'year', 'pub', 'langs', 'keyword', 'ed', 'writers', + for name in ( 'title', 'year', 'pub', 'langs', 'keyword', 'ed', 'writers', 'comments', 'pages', 'isbn', 'price', 'series', 'seriesNum', 'translator' ): entryNode.appendChild(eval(name + 'Node')) @@ -228,17 +228,17 @@ class BasicTellicoDOM: """ try: - print nEntry.toxml() + print(nEntry.toxml()) except: - print sys.stderr, "Error while outputing XML content from entry to Tellico" + print(sys.stderr, "Error while outputing XML content from entry to Tellico") def printXMLTree(self): """ Outputs XML content to stdout """ - print XML_HEADER; print DOCTYPE - print self.__root.toxml() + print(XML_HEADER); print(DOCTYPE) + print(self.__root.toxml()) class MinisterioCulturaParser: @@ -264,11 +264,11 @@ class MinisterioCulturaParser: 'cdu' : '<th scope="row">CDU:.*?<td><span>(?P<cdu>.*?)</span></td>', 'encuadernacion': '<th scope="row">Encuadernación:.*?<td>.*?<span>(?P<encuadernacion>.*?)</span>', 'series' : '<th scope="row">Colección:.*?<td>.*?<span>(?P<series>.*?)</span>' - } + } # Compile patterns objects self.__regExpsPO = {} - for k, pattern in self.__regExps.iteritems(): + for k, pattern in self.__regExps.items(): self.__regExpsPO[k] = re.compile(pattern) self.__domTree = BasicTellicoDOM() @@ -296,10 +296,10 @@ class MinisterioCulturaParser: """ Fetch HTML data from url """ - + try: - u = urllib2.urlopen(url) - except Exception, e: + u = urllib.request.urlopen(url) + except Exception as e: u.close() sys.exit(""" Network error while getting HTML content. @@ -312,7 +312,7 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: def __fetchBookLinks(self): """ - Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent() + Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent() that need to be parsed. """ @@ -333,10 +333,10 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: data['comments'] = [] # Empty string if series not available - data['series_num'] = NULLSTRING + data['series_num'] = NULLSTRING data['translator'] = NULLSTRING - for name, po in self.__regExpsPO.iteritems(): + for name, po in self.__regExpsPO.items(): data[name] = NULLSTRING matches[name] = re.search(self.__regExps[name], self.__data, re.S | re.I) @@ -391,22 +391,22 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: elif name == 'cdu': data['comments'].append('CDU: ' + matches[name].group('cdu').strip()) - + elif name == 'notas': data['comments'].append(matches[name].group('notas').strip()) - + elif name == 'series': d = matches[name].group('series').strip() d = re.sub(' ', ' ', d) data[name] = d # data[name] can contain something like 'Byblos, 162/24' - # Maybe better to add the reg exp to get seriesNum in self.__regExps + # Maybe better to add the reg exp to get seriesNum in self.__regExps p = re.compile('[0-9]+$') s = re.search(p, data[name]) if s: - # if series ends with a number, it seems that is a + # if series ends with a number, it seems that is a # number of the book inside the series. We save in seriesNum data['series_num'] = s.group() @@ -434,7 +434,7 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: # Sometimes, the search engine outputs some image between a elements if d.strip()[:4] != '<img': data[name].append(d.strip()) - + # Move tr authors (translators) to translators list translator = self.__getSpecialRol(data[name], TRANSLATOR_STR) edlit = self.__getSpecialRol(data[name], EDLIT_STR) @@ -470,12 +470,12 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: def __getBook(self, data, kind = ISBN): - if not len(data): - raise EngineError, "No data given. Unable to proceed." + if not len(data): + raise EngineError("No data given. Unable to proceed.") if kind == ISBN: self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ - (urllib.quote(data), # ISBN + (urllib.parse.quote(data), # ISBN NULLSTRING, # AUTHOR NULLSTRING), # TITLE self.__suffixURL) @@ -483,7 +483,7 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: elif kind == AUTHOR: self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ (NULLSTRING, # ISBN - urllib.quote(data), # AUTHOR + urllib.parse.quote(data), # AUTHOR NULLSTRING), # TITLE self.__suffixURL) ) @@ -492,7 +492,7 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ (NULLSTRING, # ISBN NULLSTRING, # AUTHOR - urllib.quote(data)), # TITLE + urllib.parse.quote(data)), # TITLE self.__suffixURL) ) @@ -519,12 +519,12 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: if authors[j] == special: special_rol.append(authors[j-1]) j += 1 - + return special_rol def __removeSpecialsFromAuthors(self, authors, specials, string): """ - Receives a list with authors+translators and removes 'tr.' and + Receives a list with authors+translators and removes 'tr.' and authors from there. Example: authors: ['Stephen King','Lorenzo Cortina','tr.','Rosalía Vázquez','tr.'] translators: ['Lorenzo Cortina','Rosalía Vázquez'] @@ -551,16 +551,16 @@ Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: return prefix + string.join(translators, '; ') def halt(): - print "HALT." + print("HALT.") sys.exit(0) def showUsage(): - print """Usage: %s options + print("""Usage: %s options Where options are: -t title -i (ISBN|UPC) -a author - -m filename (support for multiple ISBN/UPC search)""" % sys.argv[0] + -m filename (support for multiple ISBN/UPC search)""" % sys.argv[0]) sys.exit(1) def main(): @@ -573,7 +573,7 @@ def main(): isbnStringList = NULLSTRING opts = {'-t' : TITLE, '-i' : ISBN, '-a' : AUTHOR, '-m' : isbnStringList} - if sys.argv[1] not in opts.keys(): + if sys.argv[1] not in list(opts.keys()): showUsage() if sys.argv[1] == '-m': @@ -584,8 +584,8 @@ def main(): sys.argv[2] = string.join([d[:-1] for d in data], ';') sys.argv[1] = '-i' f.close() - except IOError, e: - print "Error: %s" % e + except IOError as e: + print("Error: %s" % e) sys.exit(1) parser = MinisterioCulturaParser() |