// src/translators/bibtexhandler.cpp

/***************************************************************************
    copyright            : (C) 2003-2006 by Robby Stephenson
    email                : robby@periapsis.org
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of version 2 of the GNU General Public License as  *
 *   published by the Free Software Foundation;                            *
 *                                                                         *
 ***************************************************************************/

#include "bibtexhandler.h"
#include "../collections/bibtexcollection.h"
#include "../entry.h"
#include "../field.h"
#include "../collection.h"
#include "../document.h"
#include "../filehandler.h"
#include "../latin1literal.h"
#include "../tellico_debug.h"

#include <kstandarddirs.h>
#include <kurl.h>
#include <kstringhandler.h>
#include <tdelocale.h>

#include <tqstring.h>
#include <tqstringlist.h>
#include <tqregexp.h>
#include <tqdom.h>

// don't add braces around capital letters by default
// (while this is 0, addBraces() returns its argument untouched)
#define TELLICO_BIBTEX_BRACES 0

using Tellico::BibtexHandler;

// translation table between characters and their LaTeX spellings;
// stays null until loadTranslationMaps() allocates it
BibtexHandler::StringListMap* BibtexHandler::s_utf8LatexMap = 0;
// selects the delimiters exportText() wraps values in; starts out as braces
BibtexHandler::QuoteStyle BibtexHandler::s_quoteStyle = BibtexHandler::BRACES;
// matches every character other than digits, ASCII letters and '-';
// bibtexKey() strips these from the keys it generates
const TQRegExp BibtexHandler::s_badKeyChars(TQString::fromLatin1("[^0-9a-zA-Z-]"));

// Collects the bibtex key of every entry in entries_, in iteration order.
// Entries for which bibtexKey() yields an empty string are left out.
TQStringList BibtexHandler::bibtexKeys(const Data::EntryVec& entries_) {
  TQStringList result;
  Data::EntryVec::ConstIterator entryIt = entries_.begin();
  while(entryIt != entries_.end()) {
    const TQString key = bibtexKey(entryIt.data());
    if(!key.isEmpty()) {
      result << key;
    }
    ++entryIt;
  }
  return result;
}

// Returns the bibtex key for entry_.
//
// An empty string is returned when the entry is null, has no collection, or
// its collection is not a bibtex collection. Otherwise a non-empty value of
// the field mapped to the bibtex name "key" is returned as is; failing that,
// a key is generated from the entry's author, title and year.
TQString BibtexHandler::bibtexKey(Data::ConstEntryPtr entry_) {
  if(!entry_ || !entry_->collection() || entry_->collection()->type() != Data::Collection::Bibtex) {
    return TQString();
  }
  const Data::BibtexCollection* coll = static_cast<const Data::BibtexCollection*>(entry_->collection().data());

  // an explicit key always wins
  Data::FieldPtr keyField = coll->fieldByBibtexName(TQString::fromLatin1("key"));
  if(keyField) {
    const TQString explicitKey = entry_->field(keyField->name());
    if(!explicitKey.isEmpty()) {
      return explicitKey;
    }
  }

  // author: when the field allows multiple values, only the first one is used
  TQString author;
  Data::FieldPtr authorField = coll->fieldByBibtexName(TQString::fromLatin1("author"));
  if(authorField) {
    author = entry_->field(authorField->name());
    if(authorField->flags() & Data::Field::AllowMultiple) {
      author = author.section(';', 0, 0);
    }
  }

  // title
  TQString title;
  Data::FieldPtr titleField = coll->fieldByBibtexName(TQString::fromLatin1("title"));
  if(titleField) {
    title = entry_->field(titleField->name());
  }

  // year: the bibtex "year" field first, then the "pub_year" and "cr_year" fields
  TQString year;
  Data::FieldPtr yearField = coll->fieldByBibtexName(TQString::fromLatin1("year"));
  if(yearField) {
    year = entry_->field(yearField->name());
  }
  if(year.isEmpty()) {
    year = entry_->field(TQString::fromLatin1("pub_year"));
  }
  if(year.isEmpty()) {
    year = entry_->field(TQString::fromLatin1("cr_year"));
  }
  // keep only the first value if several are present
  year = year.section(';', 0, 0);

  return bibtexKey(author, title, year);
}

// Builds a bibtex key of the form "<name>-<initials><year>".
//
// <name> is the lower-cased text before the first comma of author_ or, when
// there is no comma, its last space-separated word; the "<name>-" part is
// skipped entirely for an empty author_. <initials> are the lower-cased first
// characters of the space-separated words of title_. Finally every character
// matched by s_badKeyChars is removed from the result.
TQString BibtexHandler::bibtexKey(const TQString& author_, const TQString& title_, const TQString& year_) {
  TQString key;

  if(!author_.isEmpty()) {
    const bool hasComma = author_.find(',') != -1;
    const TQString name = hasComma ? author_.section(',', 0, 0)
                                   : author_.section(' ', -1);
    key += name.lower() + '-';
  }

  const TQStringList titleWords = TQStringList::split(' ', title_);
  TQStringList::ConstIterator word = titleWords.begin();
  while(word != titleWords.end()) {
    key += (*word).left(1).lower();
    ++word;
  }

  key += year_;

  // bibtex key may only contain [0-9a-zA-Z-]
  key.replace(s_badKeyChars, TQString());
  return key;
}

void BibtexHandler::loadTranslationMaps() {
  TQString mapfile = locate("appdata", TQString::fromLatin1("bibtex-translation.xml"));
  if(mapfile.isEmpty()) {
    return;
  }

  s_utf8LatexMap = new StringListMap();

  KURL u;
  u.setPath(mapfile);
  // no namespace processing
  TQDomDocument dom = FileHandler::readXMLFile(u, false);

  TQDomNodeList keyList = dom.elementsByTagName(TQString::fromLatin1("key"));

  for(unsigned i = 0; i < keyList.count(); ++i) {
    TQDomNodeList strList = keyList.item(i).toElement().elementsByTagName(TQString::fromLatin1("string"));
    // the strList might have more than one node since there are multiple ways
    // to represent a character in LaTex.
    TQString s = keyList.item(i).toElement().attribute(TQString::fromLatin1("char"));
    for(unsigned j = 0; j < strList.count(); ++j) {
      (*s_utf8LatexMap)[s].append(strList.item(j).toElement().text());
//      kdDebug() << "BibtexHandler::loadTranslationMaps - "
//       << s << " = " << strList.item(j).toElement().text() << endl;
    }
  }
}

// Converts raw bibtex text (UTF-8 encoded) into display text.
//
// Every LaTeX spelling listed in the translation table is replaced by the
// character it stands for, then the braces around runs of capital letters
// such as {X} or {ABC} are dropped.
TQString BibtexHandler::importText(char* text_) {
  if(!s_utf8LatexMap) {
    loadTranslationMaps();
  }

  TQString str = TQString::fromUtf8(text_);

  // the table may still be missing if it could not be loaded; in that case
  // skip the substitution instead of dereferencing a null pointer
  if(s_utf8LatexMap) {
    for(StringListMap::Iterator it = s_utf8LatexMap->begin(); it != s_utf8LatexMap->end(); ++it) {
      for(TQStringList::Iterator sit = it.data().begin(); sit != it.data().end(); ++sit) {
        str.replace(*sit, it.key());
      }
    }
  }

  // now replace braced capital letters, such as {X}, with the bare letters;
  // only runs consisting solely of A-Z directly inside one pair of braces match
  TQRegExp rx(TQString::fromLatin1("\\{([A-Z]+)\\}"));
  rx.setMinimal(true);
  str.replace(rx, TQString::fromLatin1("\\1"));

  return str;
}

// Converts text_ into a delimited bibtex value.
//
// Characters found in the translation table are replaced by their first
// listed LaTeX spelling. Without macros the whole text is wrapped in the
// delimiters chosen by s_quoteStyle. With macros the text is split at '#':
// tokens that (ignoring surrounding whitespace) name a macro are emitted
// bare, all other tokens are wrapped, and the pieces are joined with '#'.
TQString BibtexHandler::exportText(const TQString& text_, const TQStringList& macros_) {
  if(!s_utf8LatexMap) {
    loadTranslationMaps();
  }

  // start from braces, the initial value of s_quoteStyle, so the delimiters
  // are never left as null characters
  TQChar lquote = '{';
  TQChar rquote = '}';
  switch(s_quoteStyle) {
    case BRACES:
      break;
    case QUOTES:
      lquote = '"';
      rquote = '"';
      break;
  }

  TQString text = text_;

  // the table may still be missing if it could not be loaded; in that case
  // export the text untranslated instead of dereferencing a null pointer
  if(s_utf8LatexMap) {
    for(StringListMap::Iterator it = s_utf8LatexMap->begin(); it != s_utf8LatexMap->end(); ++it) {
      text.replace(it.key(), it.data()[0]);
    }
  }

  if(macros_.isEmpty()) {
    return lquote + addBraces(text) + rquote;
  }

// Now, split the text by the character '#', and examine each token to see if it is in
// the macro list. If it is not, then add left-quote and right-quote around it. If it is, don't
// change it. Then, in case '#' occurs in a non-macro string, two adjacent quoted
// tokens are merged again by replacing rquote + '#' + lquote with a plain '#'

// list of new tokens
  TQStringList list;

// first, split the text, keeping empty tokens
  TQStringList tokens = TQStringList::split('#', text, true);
  for(TQStringList::Iterator it = tokens.begin(); it != tokens.end(); ++it) {
    // check to see if token is a macro
    if(macros_.findIndex((*it).stripWhiteSpace()) == -1) {
      // the token is NOT a macro, wrap it in the delimiters
      list << lquote + addBraces(*it) + rquote;
    } else {
      list << *it;
    }
  }

  const TQChar octo = '#';
  text = list.join(octo);
  text.replace(TQString(rquote)+octo+lquote, octo);

  return text;
}

// Stores value_ in the field of entry_ that is mapped to the bibtex name
// bibtexField_, creating and adding that field to the entry's collection
// when it does not exist yet.
//
// Returns false when entry_ is null or does not belong to a bibtex
// collection; otherwise returns the result of Entry::setField().
bool BibtexHandler::setFieldValue(Data::EntryPtr entry_, const TQString& bibtexField_, const TQString& value_) {
  // same sanity checks as bibtexKey(): the cast below is only valid for an
  // entry that lives in a bibtex collection
  if(!entry_ || !entry_->collection() || entry_->collection()->type() != Data::Collection::Bibtex) {
    return false;
  }

  Data::BibtexCollection* c = static_cast<Data::BibtexCollection*>(entry_->collection().data());
  Data::FieldPtr field = c->fieldByBibtexName(bibtexField_);
  if(!field) {
    // it was the case that the default bibliography did not have a bibtex property for keywords
    // so a "keywords" field would get created in the imported collection
    // but the existing collection had a field "keyword" so the values would not get imported
    // here, check to see if the current collection has a field with the same bibtex name and
    // use it instead of creating a new one
    Data::BibtexCollection* existingColl = 0;
    // the document may have no collection at all, so test it before asking for its type
    if(Data::Document::self()->collection()
       && Data::Document::self()->collection()->type() == Data::Collection::Bibtex) {
      existingColl = static_cast<Data::BibtexCollection*>(Data::Document::self()->collection().data());
    }
    Data::FieldPtr existingField = existingColl ? existingColl->fieldByBibtexName(bibtexField_) : 0;
    if(existingField) {
      field = new Data::Field(*existingField);
    } else if(value_.length() < 100) {
      // arbitrarily say if the value has 100 chars or more, then it's a paragraph
      TQString vlower = value_.lower();
      // special case, try to detect URLs
      // In qt 3.1, TQString::startsWith() is always case-sensitive
      if(bibtexField_ == Latin1Literal("url")
         || vlower.startsWith(TQString::fromLatin1("http")) // may also be https
         || vlower.startsWith(TQString::fromLatin1("ftp:/"))
         || vlower.startsWith(TQString::fromLatin1("file:/"))
         || vlower.startsWith(TQString::fromLatin1("/"))) { // assume this indicates a local path
        myDebug() << "BibtexHandler::setFieldValue() - creating a URL field for " << bibtexField_ << endl;
        field = new Data::Field(bibtexField_, KStringHandler::capwords(bibtexField_), Data::Field::URL);
      } else {
        field = new Data::Field(bibtexField_, KStringHandler::capwords(bibtexField_), Data::Field::Line);
      }
      field->setCategory(i18n("Unknown"));
    } else {
      field = new Data::Field(bibtexField_, KStringHandler::capwords(bibtexField_), Data::Field::Para);
    }
    field->setProperty(TQString::fromLatin1("bibtex"), bibtexField_);
    c->addField(field);
  }
  // special case keywords, replace commas with semi-colons so they get separated
  TQString value = value_;
  if(field->property(TQString::fromLatin1("bibtex")).startsWith(TQString::fromLatin1("keyword"))) {
    value.replace(',', ';');
    // special case refbase bibtex export, with multiple keywords fields:
    // append to whatever an earlier keywords field already stored
    TQString oValue = entry_->field(field);
    if(!oValue.isEmpty()) {
      value = oValue + "; " + value;
    }
  }
  return entry_->setField(field, value);
}

// Strips LaTeX markup from text_ in place and returns a reference to it.
//
// Three passes: the shortest span from a backslash up to and including the
// next '{' is removed, then every remaining brace, and finally each '~' is
// turned into a plain space.
TQString& BibtexHandler::cleanText(TQString& text_) {
  // FIXME: need to improve this for removing all Latex entities
  // (a look-ahead variant, "(?=[^\\\\])\\\\.+\\{", was tried and left disabled)
  TQRegExp commandPrefix(TQString::fromLatin1("\\\\.+\\{"));
  commandPrefix.setMinimal(true);
  const TQRegExp anyBrace(TQString::fromLatin1("[{}]"));

  text_.replace(commandPrefix, TQString());
  text_.replace(anyBrace, TQString());
  text_.replace('~', ' ');
  return text_;
}

// add braces around capital letters
//
// Works on text in place and returns a reference to it. While
// TELLICO_BIBTEX_BRACES is 0 the text is returned unchanged. Otherwise every
// run of the letters A-Z found outside of braces is wrapped in '{' and '}',
// except for a single capital letter at the very start of the text.
TQString& BibtexHandler::addBraces(TQString& text) {
#if !TELLICO_BIBTEX_BRACES
  return text;
#else
  int inside = 0; // current brace nesting depth
  uint l = text.length();
  // start at first letter, but skip if only the first is capitalized
  for(uint i = 0; i < l; ++i) {
    const TQChar c = text.at(i);
    if(inside == 0 && c >= 'A' && c <= 'Z') {
      // find the end of the run of capitals; test the bound before
      // reading the character so the index never runs past the text
      uint j = i+1;
      while(j < l && text.at(j) >= 'A' && text.at(j) <= 'Z') {
        ++j;
      }
      if(i == 0 && j == 1) {
        continue; // no need to do anything to first letter
      }
      text.insert(i, '{');
      // the run moved one position to the right, so it now ends at j
      text.insert(j+1, '}');
      // continue after the closing brace that was just inserted
      i = j+1;
      l += 2; // the length changed
    } else if(c == '{') {
      ++inside;
    } else if(c == '}') {
      --inside;
    }
  }
  return text;
#endif
}