libkdepim/qutf7codec.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550

/*
  qutf7codec.cpp

  A TQTextCodec for UTF-7 (rfc2152).
  Copyright (c) 2001 Marc Mutz <mutz@kde.org>
  See file COPYING for details

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License, version 2.0,
  as published by the Free Software Foundation.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  02110-1301, US

  As a special exception, permission is granted to use this plugin
  with any version of Qt by TrollTech AS, Norway. In this case, the
  use of this plugin doesn't cause the resulting executable to be
  covered by the GNU General Public License.
  This exception does not however invalidate any other reasons why the
  executable file might be covered by the GNU General Public License.
*/


#include "qutf7codec.h"

#ifndef QT_NO_TEXTCODEC

// Numeric identifier of this codec. 1012 is the MIBenum value
// registered for the "UTF-7" charset.
int QUtf7Codec::mibEnum() const
{
  const int utf7MibEnum = 1012;
  return utf7MibEnum;
}

// Numeric identifier of the strict variant: the negated value of the
// one QUtf7Codec::mibEnum() returns.
// NOTE(review): presumably negative so that it cannot clash with a
// registered MIBenum - confirm.
int QStrictUtf7Codec::mibEnum() const
{
  const int strictUtf7MibEnum = -1012;
  return strictUtf7MibEnum;
}

// Name under which this codec is known.
const char* QUtf7Codec::name() const
{
  const char* const codecName = "UTF-7";
  return codecName;
}

// Name under which the strict variant of the codec is known.
const char* QStrictUtf7Codec::name() const
{
  const char* const codecName = "X-QT-UTF-7-STRICT";
  return codecName;
}

// Charset label to use for this codec in MIME contexts.
const char* QUtf7Codec::mimeName() const
{
  const char* const mimeCharset = "UTF-7";
  return mimeCharset;
}

// Every single character is reported as encodable; the argument is
// not inspected.
bool QUtf7Codec::canEncode( TQChar ) const
{
  const bool encodable = TRUE;
  return encodable;
}

// Every string is reported as encodable; the argument is not
// inspected.
bool QUtf7Codec::canEncode( const TQString & ) const
{
  const bool encodable = TRUE;
  return encodable;
}

// Bitmap over the 128 US-ASCII codes marking the characters of the
// base64 alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
// Layout (as read by isOfSet()): byte ch/8 holds the flag for code ch,
// most significant bit first, so each row below covers 32 codes.
static uchar base64Set[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// Bitmap (same layout as base64Set) of those base64 characters whose
// 6-bit value has its two least significant bits clear, i.e. is a
// multiple of 4: A E I M Q U Y c g k o s w 0 4 8.
// Used by heuristicContentMatch() to check the padding bits of a
// shifted sequence.
static uchar base64SetWithLastTwoBitsZero[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
  0x44, 0x44, 0x44, 0x40, // '@' ... '_'
  0x11, 0x11, 0x11, 0x00  // '`' ... DEL
};

// Bitmap (same layout as base64Set) of the characters that the encoder
// always writes verbatim: 'A'-'Z', 'a'-'z', '0'-'9' and
// ' ( ) , - . / : ?   (the "Set D" of rfc2152).
// Note that '+' is deliberately absent.
static uchar directSet[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// Bitmap (same layout as base64Set) of the characters that may
// optionally be written verbatim:
// ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }   (the "Set O" of rfc2152).
// '\' and '~' are not part of it. QUtf7Encoder merges this set into
// its "doesn't need encoding" set unless asked to encode them.
static uchar optDirectSet[] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
  0x80, 0x00, 0x00, 0x17, // '@' ... '_'
  0x80, 0x00, 0x00, 0x1C  // '`' ... DEL
};

static inline bool isOfSet(uchar ch, uchar* set) {
  return set[ ch/8 ] & (0x80 >> ( ch%8 ));
}

// Scans "chars" and reports how many leading bytes look like
// well-formed UTF-7.
//
// The scan stops at
//  - the first byte >= 128 (UTF-7 is a 7bit encoding),
//  - a '+' that is followed by neither '-' nor a base64 char,
//  - the end of a shifted sequence whose padding bits are not all 0.
//
// Returns the index of the offending byte, or "len" if the whole
// input passed.
int QUtf7Codec::heuristicContentMatch(const char* chars, int len) const
{
  // 0-based position of the current base64 char inside a group of
  // 8 base64 chars (8 * 6 bits == 3 * 16 bits), same meaning as the
  // step counter of QUtf7Decoder:
  //   char 2 carries the last 4 bits of the 1st unit + 2 further bits
  //   char 5 carries the last 2 bits of the 2nd unit + 4 further bits
  //   char 7 carries the last 6 bits of the 3rd unit
  int stepNo = 0;
  int i;
  bool shifted = FALSE;
  bool rightAfterEscape = FALSE;
  // TRUE as long as every bit seen since the last completed 16bit
  // unit is 0, i.e. as long as the sequence may legally end here.
  bool onlyNullBitsSinceLastBoundary = TRUE;
  for ( i = 0; i < len ; i++ ) {
    if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
      break;
    if (shifted) {
      if ( isOfSet(chars[i],base64Set) ) {
        switch (stepNo) {
        case 2:
          // completes a unit; only the 2 low bits are left over
          onlyNullBitsSinceLastBoundary
            = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
          break;
        case 5:
          // completes a unit; the 4 low bits are left over, so the
          // value must be 0, 16, 32 or 48
          onlyNullBitsSinceLastBoundary
            = ( chars[i] == 'A' || chars[i] == 'Q' ||
                chars[i] == 'g' || chars[i] == 'w' );
          break;
        case 7:
          // completes a unit without any bits left over
          onlyNullBitsSinceLastBoundary = TRUE;
          break;
        default:
          // all 6 bits belong to a not yet completed unit
          onlyNullBitsSinceLastBoundary
            = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
        }
        stepNo = (stepNo + 1) % 8;
        rightAfterEscape = FALSE;
      } else {
        if (rightAfterEscape && chars[i] != '-')
          break; // a '+' must be followed by '-' or a base64 char
        if (!onlyNullBitsSinceLastBoundary)
          break; // non-zero bits in the tail of the base64 encoding
        shifted = FALSE;
        stepNo = 0;
      }
    } else {
      if (chars[i] == '+') {
        shifted = TRUE;
        rightAfterEscape = TRUE;
      }
    }
  }
  return i;
}

class QUtf7Decoder : public TQTextDecoder {
  // the storage for our unicode char until it's finished
  ushort uc;
  // the state of the base64 decoding
  // can be 0 (just finished three unicode chars)
  //        1 (have the upper  6 bits of uc already)
  //        2 (have the upper 12 bits of uc already)
  //        3 (have the upper  2 bits of uc already)
  // ..........
  //        7 (have the upper 10 bits of uc already)
  //   =>   n (have the upper (n * 6) % 16 bits of uc already)
  // "stepNo" cycles through all it's values every three
  // unicode chars.
  char stepNo;
  // remembers if we are in shifted-sequence mode
  bool shifted;
  // remembers if we're just after the initial '+'
  // of a shifted-sequence.
  bool rightAfterEscape;
public:
  QUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
  {
  }

private:
  inline void resetParser()
  {
    uc = 0;
    stepNo = 0;
    shifted = FALSE;
    rightAfterEscape = FALSE;
  }

public:
  TQString toUnicode(const char* chars, int len)
  {
    TQString result = "";
    for (int i=0; i<len; i++) {
      uchar ch = chars[i];

      //
      // check for 8bit char's:
      // 
      if ( ch > 127 ) {
	qWarning("QUtf7Decoder: 8bit char found in input. "
		 "Parser has been re-initialized!");
	resetParser();
	result += TQChar::replacement;
	continue;
      }

      if (shifted) { // in shifted mode

	//
	// first, we check specialities that only occur
	// right after the escaping '+':
	//
	if ( rightAfterEscape && ch == '-' ) {
	  // a "+-" sequence is a short-circuit encoding
	  // for just '+':
	  resetParser();
	  result += TQChar('+');
	  // we're already done for this "ch", so
	  continue;
	}

	//
	// Here we're going to extract the bits represented by "ch":
	//
	ushort bits;
	if ( ch >= 'A' && ch <= 'Z' ) {
	  bits = ch - 'A';
	} else if ( ch >= 'a' && ch <= 'z' ) {
	  bits = ch - 'a' + 26;
	} else if ( ch >= '0' && ch <= '9' ) {
	  bits = ch - '0' + 52;
	} else if ( ch == '+' ) {
	  bits = 62;
	} else if ( ch == '/' ) {
	  bits = 63;
	} else {
	  bits = 0; // keep compiler happy

	  //
	  // ch is not of the base64 alphabet.
	  // Here we are going to check the sequence's validity:
	  //
	  if ( rightAfterEscape ) {
	    // any non-base64 char following an escaping '+'
	    // makes for an ill-formed sequence.
	    // Note that we catch (the valid) "+-" pair
	    // right at the beginning.
	    qWarning("QUtf7Decoder: ill-formed input: "
		     "non-base64 char after escaping \"+\"!");
	  }
	  // pending bits from base64 encoding must be all 0:
	  if (stepNo >= 1 && uc) {
	    qWarning("QUtf7Decoder: ill-formed sequence: "
		     "non-zero bits in shifted-sequence tail!");
	  }
	  resetParser();

	  // a '-' signifies the end of the shifted-sequence,
	  // so we just swallow it.
	  if ( ch == '-' )
	    continue;
	  // end of validity checking. Process ch now...
	}

	if ( /*still*/ shifted ) {
	  //
	  // now we're going to stuff the "bits" bit bucket into
	  // the right position inside "uc", emitting a resulting
	  // TQChar if possible.
	  //
	  switch (stepNo) {
	    // "bits" are the 6 msb's of uc
	  case 0: uc = bits << 10; break;

	  case 1: uc |= bits << 4; break;

	    // 4 bits of "bits" complete the first ushort
	  case 2: uc |= bits >> 2; result += TQChar(uc);
	    // 2 bits of "bits" make the msb's of the next ushort
	          uc = bits << 14; break;
	  case 3: uc |= bits << 8; break;
	  case 4: uc |= bits << 2; break;

	    // 2 bits of "bits" complete the second ushort
	  case 5: uc |= bits >> 4; result += TQChar(uc);
	    // 4 bits of "bits" make the msb's of the next ushort
	          uc = bits << 12; break;
	  case 6: uc |= bits << 6; break;

	    // these 6 bits complete the third ushort
	    // and also one round of 8 chars -> 3 ushort decoding
	  case 7: uc |= bits;      result += TQChar(uc);
	          uc = 0;          break;
	  default: ;
	  } // switch (stepNo)
	  // increase the step counter
	  stepNo++;
	  stepNo %= 8;
	  rightAfterEscape = FALSE;
	  // and look at the next char.
	  continue;
	} // fi (still) shifted
      } // fi shifted

      //
      // if control reaches here, we either weren't in a
      // shifted sequence or we just left one by seeing
      // a non-base64-char.
      // Either way, we have to process "ch" outside
      // a shifted-sequence now:
      //
      if ( ch == '+' ) {
	// '+' is the escape char for entering a
	// shifted sequence:
	shifted = TRUE;
	stepNo = 0;
	// also, we're right at the beginning where
	// special rules apply:
	rightAfterEscape = TRUE;
      } else {
	// US-ASCII values are directly used
	result += TQChar(ch);
      }
    }

    return result;

  } // toUnicode()

}; // class QUtf7Decoder

// Creates a fresh decoder in its initial state.
// The returned object is allocated with new.
TQTextDecoder* QUtf7Codec::makeDecoder() const
{
  TQTextDecoder* decoder = new QUtf7Decoder();
  return decoder;
}


// Stateful UTF-16 -> UTF-7 (rfc2152) encoder.
//
// The state survives between fromUnicode() calls, so text may be fed
// in pieces. A shifted (base64) sequence is flushed at the end of
// every call, but it is not terminated: the next call may continue
// it if the padding bits that were already written (all 0) match the
// topmost bits of the next unit.
class QUtf7Encoder : public TQTextEncoder {
  // bitmap (layout as expected by isOfSet()) of the US-ASCII chars
  // that are written verbatim
  uchar dontNeedEncodingSet[16];
  // low bits of the previous unit that did not fill a whole base64
  // char yet, already shifted to their place inside of that char
  ushort outbits;
  // number of units (modulo 3) written to the current shifted sequence
  uint stepNo : 2;
  // TRUE while inside a shifted sequence (only during fromUnicode())
  bool shifted : 1;
  // TRUE if the previous call ended inside a shifted sequence that
  // has been flushed but not yet terminated
  bool mayContinueShiftedSequence : 1;
public:
  // encOpt:  if TRUE, the chars of optDirectSet are encoded, too
  // encLwsp: if TRUE, space, tab, CR and LF are encoded, too
  QUtf7Encoder(bool encOpt, bool encLwsp)
    : outbits(0), stepNo(0),
      shifted(FALSE), mayContinueShiftedSequence(FALSE)
  {
    for ( int i = 0; i < 16 ; i++) {
      dontNeedEncodingSet[i] = directSet[i];
      if (!encOpt)
        dontNeedEncodingSet[i] |= optDirectSet[i];
    }
    if(!encLwsp) {
      dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
      dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
      dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
      dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
    }
  }

private:

  // maps a 6bit value to its base64 char
  char toBase64( ushort u ) {
    if ( u < 26 )
      return (char)u + 'A';
    else if ( u < 52 )
      return (char)u - 26 + 'a';
    else if ( u < 62 )
      return (char)u - 52 + '0';
    else if ( u == 62 )
      return '+';
    else
      return '/';
  }

  // appends the unit "u" to the current shifted sequence.
  // Bits that do not fill a whole base64 char are kept in "outbits".
  void addToShiftedSequence(TQCString::Iterator & t, ushort u) {
    switch (stepNo) {
      // no outbits; use uppermost 6 bits of u
    case 0:
      *t++ = toBase64( u >> 10 );
      *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
      // save 4 lowest-order bits in outbits[5..2]
      outbits = (u & 0x000F) << 2;
      break;

      // outbits available; use top two bits of u to complete
      // the previous char
    case 1:
      if (!mayContinueShiftedSequence) {
        // if mayContinue, this char has already been written
        *t++ = toBase64( outbits | ( u >> 14 ) );
      }
      *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
      *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
      // save 2 lowest-significant bits in outbits[5..4]
      outbits = (u & 0x0003) << 4;
      break;

      // outbits available; use top four bits of u to complete
      // the previous char
    case 2:
      if (!mayContinueShiftedSequence) {
        // if mayContinue, this char has already been written
        *t++ = toBase64( outbits | ( u >> 12 ) );
      }
      *t++ = toBase64( (u & 0x0FFF) >> 6 );
      *t++ = toBase64( u & 0x003F );
      break;

    default: ;
    }
    stepNo = (stepNo + 1) % 3;
  }

  // flushes the pending outbits (padded with 0 bits), if any.
  // Does not write the terminating '-'.
  void endShiftedSequence(TQCString::Iterator & t) {
    switch (stepNo) {
    case 1: // four outbits still to be written
    case 2: // two outbits still to be written
      *t++ = toBase64( outbits );
      break;
    case 0:      // nothing to do
    default: ;
    }
    outbits = 0;
  }

  // depending on the stepNo, checks whether we can continue
  // an already ended shifted-sequence with char "u".
  // This is only possible if the topmost bits of "u" fit the padding
  // bits of the already written char (which are all 0):
  //  - at step 1 that char holds 4 outbits + the top 2 bits of "u"
  //  - at step 2 that char holds 2 outbits + the top 4 bits of "u"
  // (cf. the matching cases in addToShiftedSequence()).
  bool continueOK( ushort u ) {
    return stepNo == 0 ||
      ( stepNo == 1 && (u & 0xC000) == 0 ) ||
      ( stepNo == 2 && (u & 0xF000) == 0 );
  }

  // writes the US-ASCII char "ch" verbatim, leaving the shifted
  // sequence first if there is one
  void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) {
    // doesn't need encoding
    if (shifted) {
      endShiftedSequence(t);
      // add "lead-out" to dis-ambiguate following chars:
      if (isOfSet((uchar)ch,base64Set) || ch == '-' ) {
        *t++ = '-';
      }
    } else if (mayContinueShiftedSequence) {
      // if mayContinue is set, this means the
      // shifted-sequence needs a lead-out.
      mayContinueShiftedSequence = FALSE;
      if (isOfSet((uchar)ch,base64Set) || ch == '-' ) {
        *t++ = '-';
      }
    }
    *t++ = (uchar)ch;
    shifted = FALSE;
    stepNo = 0;
  }

public:
  // Encodes the first "len_in_out" units of "uc". On return,
  // "len_in_out" holds the number of bytes written (without the
  // trailing \0). Passing a null string terminates a shifted sequence
  // that is still open from the previous call ("return to ascii").
  TQCString fromUnicode(const TQString & uc, int & len_in_out)
  {
    // allocate place for worst case:
    //   len/2 * (5+1) for an alternating sequence of e.g. "A\",
    // + 4             for a worst-case of another +ABC encoded char
    // + 1             for the trailing \0
    // (this also leaves room for the single '-' that may be needed to
    // terminate a sequence left open by the previous call)
    //
    int maxreslen = 3 * len_in_out + 5;
    TQCString result( maxreslen );

#if 0
    //    if (len_in_out == 1) {
    cout << "\nlen_in_out: " << len_in_out
	 <<"; shifted: " << (shifted ? "true" : "false")
	 << ";\n" << "mayContinue: "
	 << (mayContinueShiftedSequence ? "true" : "false")
	 << "; stepNo: " << stepNo << ";\n"
	 << "outbits: " << outbits << endl;
      //    }
#endif

    // source and destination cursor
    const TQChar * s = uc.unicode();
    TQCString::Iterator t = result.data();

    if ( uc.isNull() ) {
      // return to ascii requested:
      if ( mayContinueShiftedSequence ) {
        *t++ = '-';
        // the sequence is terminated now, so it must not be
        // continued (or terminated again) by the next call:
        mayContinueShiftedSequence = FALSE;
        stepNo = 0;
      }
    } else {
      // normal operation:
      for (int i = 0 ; i < len_in_out ;
           i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
        ushort ch = s[i].unicode();

        //
        // first, we check whether we might get around encoding:
        //
        if ( ch < 128 ) {
          //
          // ch is usAscii, so we have a chance that we don't
          // need to encode it.
          //
          if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
            processDoesntNeedEncoding(t,ch);
            continue;
          } else if ( ch == '+' ) {
            // '+' is the shift escape character
            if (shifted || mayContinueShiftedSequence) {
              // if we are already in shifted mode, we just
              // encode the '+', too. Compare
              // 24bits ("-+-") + some from ending the shifted-sequence
              // with 21,33 bits.
              // ('+' has no high bits set, so continuing a flushed
              // sequence is always possible here.)
              addToShiftedSequence(t,ch);
              mayContinueShiftedSequence = FALSE;
              shifted = TRUE;
            } else {
              // shortcut encoding of '+':
              *t++ = '+';
              *t++ = '-';
            }
            continue; // done
          } // else fall through to encoding
        }
        //
        // need encoding
        //
        if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
          if ( mayContinueShiftedSequence ) {
            // the sequence left open by the previous call can't be
            // continued. Terminate it, or else the '+' below would
            // be read as a base64 char by the decoder.
            *t++ = '-';
          }
          *t++ = '+';
          stepNo = 0;
        }
        addToShiftedSequence(t,ch);
        shifted = TRUE;
        mayContinueShiftedSequence = FALSE;
      }

      if ( shifted ) {
        // flush, but leave the sequence open for the next call
        endShiftedSequence(t);
        mayContinueShiftedSequence = TRUE;
      }
      shifted = FALSE;
    }

    *t = '\0';
    len_in_out = t - result.data();

#if 0
    cout << "len_in_out: " << len_in_out << "; "
	 << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
	 << "; stepNo: " << stepNo << endl;
#endif

    Q_ASSERT(len_in_out <= maxreslen-1);

    return result;
  } // fromUnicode()

}; // class QUtf7Encoder

// Creates a fresh encoder that writes both the optional direct chars
// and linear whitespace verbatim.
// The returned object is allocated with new.
TQTextEncoder* QUtf7Codec::makeEncoder() const
{
  const bool encodeOptionalDirectChars = false;
  const bool encodeLinearWhitespace = false;
  return new QUtf7Encoder( encodeOptionalDirectChars,
                           encodeLinearWhitespace );
}

// Creates a fresh encoder that encodes the optional direct chars, but
// still writes linear whitespace verbatim.
// The returned object is allocated with new.
TQTextEncoder* QStrictUtf7Codec::makeEncoder() const
{
  const bool encodeOptionalDirectChars = true;
  const bool encodeLinearWhitespace = false;
  return new QUtf7Encoder( encodeOptionalDirectChars,
                           encodeLinearWhitespace );
}

#endif // QT_NO_TEXTCODEC