#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# ***************************************************************************
# copyright : (C) 2006-2008 by Mathias Monnerville
# email : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# * *
# * This program is free software; you can redistribute it and/or modify *
# * it under the terms of version 2 of the GNU General Public License as *
# * published by the Free Software Foundation; *
# * *
# ***************************************************************************
# $Id: books_ministerio_de_cultura.py 428 2007-03-07 13:17:17Z mathias $
"""
This script has to be used with tellico (http://periapsis.org/tellico) as an external data source program.
It allows searching for books in the Spanish Ministry of Culture's database (search form at http://www.mcu.es/comun/bases/isbn/ISBN.html).
Multiple ISBN/UPC searching is supported through the -m option:
./books_ministerio_de_cultura.py -m filename
where filename holds one ISBN or UPC per line.
Tellico data source setup:
- Source type: External Application
- Source name: Ministerio de Cultura (ES) (or whatever you want :)
- Collection type: Book Collection
- Result type: Tellico
- Path: /path/to/script/books_ministerio_de_cultura.py
- Arguments:
Title (checked) = -t %1
Person (checked) = -a %1
ISBN (checked) = -i %1
UPC (checked) = -i %1
Update (checked) = %{title}
** Please note that this script is also part of Tellico's distribution.
** You will always find the latest version in Tellico's SVN trunk.
SVN Version:
* Removes translators for Authors List
* Adds translators to translator field
* Change from "Collection" to "Series"
* Process "Series Number"
* Adds in comments "ed.lit." authors
* If the Spanish Ministry of Culture's site cannot be reached,
a nice error message is shown (timeout: 5 seconds)
* Removed "translated from/to" from Comments field as already
exists in "Publishing" field
* Removed "Collection" field as I moved to Series/Series Number
Version 0.3.2:
* Now find 'notas' field related information
* search URL modified to fetch information of exhausted books too
Version 0.3.1:
Bug Fixes:
* The 'tr.' string does not appear among authors anymore
* Fixed an AttributeError exception related to a regexp matching the number of pages
Version 0.3:
Bug Fixes:
* URL of the search engine has changed:
http://www.mcu.es/bases/spa/isbn/ISBN.html is now http://www.mcu.es/comun/bases/isbn/ISBN.html
* All the regexps have been rewritten to match the new site's content
Version 0.2:
New features:
* Support for multiple ISBN/UPC searching (support from command line with -m option)
* Default books collection enhanced with a new custom field 'Collection'
* Search extended for both available and exhausted books
* Hyphens are stripped out in the ISBN (or UPC) search
Bug Fixes:
* Publication year now holds only the year
* ISBN regexp fix
* Fix for publisher field (values were inverted)
* -i parameter works for both ISBN and UPC based search
Version 0.1:
* Initial Release
"""
import sys, os, re, md5, random, string
import urllib, urllib2, time, base64
import xml.dom.minidom, types
import socket
# Preamble lines printed by BasicTellicoDOM.printXMLTree() ahead of the XML tree.
# NOTE(review): both are empty here, so two blank lines are printed before the
# document -- confirm whether an XML declaration / doctype was intended.
XML_HEADER = ''
DOCTYPE = ''

# Named empty string.
NULLSTRING = ""

# Version of this script.
VERSION = '0.3.2'

# Integer tags 0, 1 and 2.
# NOTE(review): presumably they select the kind of search (ISBN / author /
# title); the code using them is not visible in this section -- confirm.
ISBN = 0
AUTHOR = 1
TITLE = 2

# Markers for translators ("tr.") and literary editors ("ed. lit.").
# NOTE(review): presumably matched against contributor names coming from the
# site, as the changelog in the module docstring suggests -- confirm.
TRANSLATOR_STR = 'tr.'
EDLIT_STR = 'ed. lit.'
class EngineError(Exception):
    """
    Exception type defined by this script; it adds nothing to Exception.

    NOTE(review): presumably raised when querying the search engine fails;
    the raise sites are not visible in this section -- confirm.
    """
class BasicTellicoDOM:
    """
    This class manages tellico's XML data model (DOM).

    The constructor builds a <tellico> root holding a single <collection>
    element; every call to addEntry() appends one <entry> to that collection.
    """

    def __init__(self):
        """
        Build the document skeleton: root, collection and default fields.
        """
        self.__doc = xml.dom.minidom.Document()

        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Books')
        self.__collection.setAttribute('type', '2')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')
        self.__fields.appendChild(self.__dfltField)

        # Add a custom 'Collection' field (Left by reference for
        # the future)
        #self.__customCollectionField = self.__doc.createElement('field')
        #self.__customCollectionField.setAttribute('name', 'book_collection')
        #self.__customCollectionField.setAttribute('title', 'Collection')
        #self.__customCollectionField.setAttribute('flags', '7')
        #self.__customCollectionField.setAttribute('category', 'Classification')
        #self.__customCollectionField.setAttribute('format', '0')
        #self.__customCollectionField.setAttribute('type', '1')
        #self.__customCollectionField.setAttribute('i18n', 'yes')
        #self.__fields.appendChild(self.__customCollectionField)

        self.__collection.appendChild(self.__fields)
        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Id given to the next entry. See entry's id attribute in self.addEntry()
        self.__currentId = 0

    @staticmethod
    def __toUtf8(value):
        """
        Return value re-encoded from Latin-1 to UTF-8 when it is a byte string.

        Byte strings (the plain str type on Python 2) are decoded as Latin-1
        and encoded as UTF-8; any other value is returned unchanged.
        """
        if isinstance(value, bytes):
            return value.decode('latin-1').encode('utf-8')
        return value

    def __textElement(self, tagName, text):
        """
        Return a new <tagName> element whose only child is a text node.
        """
        node = self.__doc.createElement(tagName)
        node.appendChild(self.__doc.createTextNode(text))
        return node

    def __groupElement(self, groupName, itemName, values):
        """
        Return a new <groupName> element holding one <itemName> per value.
        """
        group = self.__doc.createElement(groupName)
        for value in values:
            group.appendChild(self.__textElement(itemName, value))
        return group

    def addEntry(self, movieData):
        """
        Add a book entry.

        movieData is a dictionary with the keys 'title', 'pub_year',
        'publisher', 'keyword', 'edition', 'pages', 'isbn', 'pur_price',
        'series', 'series_num' and 'translator' (one string each) and
        'language', 'author' and 'comments' (iterables of strings).
        A missing key raises KeyError and leaves the collection unchanged.
        The dictionary itself is never modified.

        Returns an entry node instance
        """
        # Convert all strings to UTF-8. The result goes into a fresh dict:
        # converting the caller's dict in place would double-encode the values
        # if the same dict were passed in a second time.
        d = {}
        for key, value in movieData.items():
            if isinstance(value, list):
                d[key] = [self.__toUtf8(item) for item in value]
            else:
                d[key] = self.__toUtf8(value)

        # NOTE(review): the separator is reproduced from the source, where it
        # appears as a raw line break inside the literal; the comments field
        # may have been meant to use an HTML line break instead -- confirm.
        commentsData = '\n'.join(d['comments'])

        # Every child is built before anything is attached, so a failure here
        # leaves neither a half-filled entry nor a consumed id behind.
        children = [
            self.__textElement('title', d['title']),
            self.__textElement('pub_year', d['pub_year']),
            self.__textElement('publisher', d['publisher']),
            self.__groupElement('languages', 'language', d['language']),
            # The <keywords> wrapper is what gets attached, like <languages>
            # and <authors>. Attaching the inner <keyword> instead would pull
            # it out of its wrapper and leave the wrapper orphaned.
            self.__groupElement('keywords', 'keyword', [d['keyword']]),
            self.__textElement('edition', d['edition']),
            self.__groupElement('authors', 'author', d['author']),
            self.__textElement('comments', commentsData),
            self.__textElement('pages', d['pages']),
            self.__textElement('isbn', d['isbn']),
            self.__textElement('pur_price', d['pur_price']),
            self.__textElement('series', d['series']),
            self.__textElement('series_num', d['series_num']),
            self.__textElement('translator', d['translator']),
        ]

        entryNode = self.__doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))
        for child in children:
            entryNode.appendChild(child)

        self.__collection.appendChild(entryNode)
        self.__currentId += 1
        return entryNode

    def printEntry(self, nEntry):
        """
        Prints entry's XML content to stdout

        If serializing or printing fails, an error message is written to
        stderr instead and no exception is propagated.
        """
        try:
            # Single parenthesised argument: valid both as a print statement
            # and as a print() call.
            print(nEntry.toxml())
        except Exception:
            # The message must go to stderr: stdout carries the XML output.
            sys.stderr.write("Error while outputing XML content from entry to Tellico\n")

    def printXMLTree(self):
        """
        Outputs XML content to stdout

        Prints XML_HEADER, then DOCTYPE, then the serialized root element,
        each on its own line.
        """
        print(XML_HEADER)
        print(DOCTYPE)
        print(self.__root.toxml())
class MinisterioCulturaParser:
def __init__(self):
# Search form is at http://www.mcu.es/comun/bases/isbn/ISBN.html
self.__baseURL = 'http://www.mcu.es'
self.__searchURL = '/cgi-brs/BasesHTML/isbn/BRSCGI?CMD=VERLST&BASE=ISBN&DOCS=1-15&CONF=AEISPA.cnf&OPDEF=AND&SEPARADOR=' + \
'&WDIS-C=DISPONIBLE+or+AGOTADO&WGEN-C=&WISB-C=%s&WAUT-C=%s&WTIT-C=%s&WMAT-C=&WEDI-C=&'
self.__suffixURL = 'WFEP-C=&%40T353-GE=&%40T353-LE=&WSER-C=&WLUG-C=&WLEN-C=&WCLA-C=&WSOP-C='
# Define some regexps
self.__regExps = { 'author' : '