• Skip to content
  • Skip to link menu
KDE 4.1 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

kcharsets.cpp

Go to the documentation of this file.
00001 /* This file is part of the KDE libraries
00002     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
00003     Copyright (C) 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
00004     Copyright (C) 2007 Nick Shaforostoff <shafff@ukr.net>
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 */
00021 #include "kcharsets.h"
00022 
00023 #include "kfilterdev.h"
00024 #include "kentities.c"
00025 
00026 #include "kconfig.h"
00027 #include "kdebug.h"
00028 #include "kglobal.h"
00029 #include "klocale.h"
00030 
00031 #include <QtCore/QDir>
00032 #include <QtCore/QRegExp>
00033 #include <QtCore/QCharRef>
00034 #include <QtCore/QMutableStringListIterator>
00035 #include <QtCore/QTextCodec>
00036 
00037 #include <assert.h>
00038 #include <QHash>
00039 
00040 /*
00041  * ### FIXME KDE4: the name of the encodings should mostly be uppercase
00042  * The names of this list are user-visible
00043  * Generate with generate_string_table.pl, input data:
00044 ISO 8859-1
00045 i18n:Western European
00046 ISO 8859-15
00047 i18n:Western European
00048 ISO 8859-14
00049 i18n:Western European
00050 cp 1252
00051 i18n:Western European
00052 IBM850
00053 i18n:Western European
00054 ISO 8859-2
00055 i18n:Central European
00056 ISO 8859-3
00057 i18n:Central European
00058 ISO 8859-4
00059 i18n:Baltic
00060 ISO 8859-13
00061 i18n:Baltic
00062 ISO 8859-16
00063 i18n:South-Eastern Europe
00064 cp 1250
00065 i18n:Central European
00066 cp 1254
00067 i18n:Turkish
00068 cp 1257
00069 i18n:Baltic
00070 KOI8-R
00071 i18n:Cyrillic
00072 ISO 8859-5
00073 i18n:Cyrillic
00074 cp 1251
00075 i18n:Cyrillic
00076 KOI8-U
00077 i18n:Cyrillic
00078 IBM866
00079 i18n:Cyrillic
00080 Big5
00081 i18n:Chinese Traditional
00082 Big5-HKSCS
00083 i18n:Chinese Traditional
00084 GB18030
00085 i18n:Chinese Simplified
00086 GBK
00087 i18n:Chinese Simplified
00088 GB2312
00089 i18n:Chinese Simplified
00090 EUC-KR
00091 i18n:Korean
00092 sjis
00093 i18n:Japanese
00094 jis7
00095 i18n:Japanese
00096 EUC-JP
00097 i18n:Japanese
00098 ISO 8859-7
00099 i18n:Greek
00100 cp 1253
00101 i18n:Greek
00102 ISO 8859-6
00103 i18n:Arabic
00104 cp 1256
00105 i18n:Arabic
00106 ISO 8859-8
00107 i18n:Hebrew
00108 ISO 8859-8-I
00109 i18n:Hebrew
00110 cp 1255
00111 i18n:Hebrew
00112 ISO 8859-9
00113 i18n:Turkish
00114 TIS620
00115 i18n:Thai
00116 ISO 8859-11
00117 i18n:Thai
00118 UTF-8
00119 i18n:Unicode
00120 UTF-16
00121 i18n:Unicode
00122 utf7
00123 i18n:Unicode
00124 ucs2
00125 i18n:Unicode
00126 ISO 10646-UCS-2
00127 i18n:Unicode
00128 winsami2
00129 i18n:Northern Saami
00130 windows-1258
00131 i18n:Other
00132 IBM874
00133 i18n:Other
00134 TSCII
00135 i18n:Other
00136  */
00137 /*
00138  * Notes about the table:
00139  *
00140  * - The following entries were disabled and removed from the table:
00141 ibm852
00142 i18n:Central European
00143 pt 154
00144 i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
00145  *
00146  * - ISO 8859-11 is the deprecated name of TIS-620
00147  * - utf7 is not in Qt
00148  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00149  * - windows-1258: TODO
00150  * - IBM874: TODO
00151  * - TSCII: TODO
00152  */
// Flattened string table of user-visible encoding names and the script
// ("language") description each one belongs to. Strings are NUL-separated and
// the table ends with an empty string. A description is stored only once,
// directly after the first encoding that uses it.
//
// Entries are addressed through the byte offsets in
// language_for_encoding_indices; those offsets only line up if I18N_NOOP2
// contributes nothing but its second argument to the literal. Any change
// here requires regenerating the index table (generate_string_table.pl).
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    I18N_NOOP2("@item Text character set", "Western European")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    I18N_NOOP2("@item Text character set", "Central European")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    I18N_NOOP2("@item Text character set", "Baltic")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    I18N_NOOP2("@item Text character set", "South-Eastern Europe")"\0"
    "cp 1250\0"
    "cp 1254\0"
    I18N_NOOP2("@item Text character set", "Turkish")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    I18N_NOOP2("@item Text character set", "Cyrillic")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    I18N_NOOP2("@item Text character set", "Chinese Traditional")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    I18N_NOOP2("@item Text character set", "Chinese Simplified")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    I18N_NOOP2("@item Text character set", "Korean")"\0"
    "sjis\0"
    I18N_NOOP2("@item Text character set", "Japanese")"\0"
    "jis7\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    I18N_NOOP2("@item Text character set", "Greek")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    I18N_NOOP2("@item Text character set", "Arabic")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    I18N_NOOP2("@item Text character set", "Hebrew")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    I18N_NOOP2("@item Text character set", "Thai")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    I18N_NOOP2("@item Text character set", "Unicode")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "winsami2\0"
    I18N_NOOP2("@item Text character set", "Northern Saami")"\0"
    "windows-1258\0"
    I18N_NOOP2("@item Text character set", "Other")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";
00218 
// Index table for language_for_encoding_string. The values come in pairs:
// the byte offset of an encoding name followed by the byte offset of its
// script description. A single -1 terminates the list.
static const int language_for_encoding_indices[] = {
       0,   11,   28,   11,   40,   11,   52,   11,
      60,   11,   67,   78,   95,   78,  106,  117,
     124,  117,  136,  148,  169,   78,  177,  185,
     193,  117,  201,  208,  217,  208,  228,  208,
     236,  208,  243,  208,  250,  255,  275,  255,
     286,  294,  313,  294,  317,  294,  324,  331,
     338,  343,  352,  343,  357,  343,  364,  375,
     381,  375,  389,  400,  407,  400,  415,  426,
     433,  426,  446,  426,  454,  185,  465,  472,
     477,  472,  489,  495,  503,  495,  510,  495,
     515,  495,  520,  495,  536,  545,  560,  573,
     579,  573,  586,  573,   -1
};
00233 
00234 /*
00235  * defines some different names for codecs that are built into Qt.
00236  * The names in this list must be lower-case.
00237  * input data for generate_string_table.pl:
00238 iso-ir-111
00239 koi8-r
00240 koi unified
00241 koi8-r
00242 us-ascii
00243 iso 8859-1
00244 usascii
00245 iso 8859-1
00246 ascii
00247 iso 8859-1
00248 unicode-1-1-utf-7
00249 utf-7
00250 ucs2
00251 iso-10646-ucs-2
00252 iso10646-1
00253 iso-10646-ucs-2
00254 gb18030.2000-1
00255 gb18030
00256 gb18030.2000-0
00257 gb18030
00258 gbk-0
00259 gbk
00260 gb2312
00261 gbk
00262 gb2312.1980-0
00263 gbk
00264 big5-0
00265 big5
00266 euc-kr
00267 euckr
00268 euc-jp
00269 eucjp
00270 jisx0201.1976-0
00271 eucjp
00272 jisx0208.1983-0
00273 eucjp
00274 jisx0208.1990-0
00275 eucjp
00276 jisx0208.1997-0
00277 eucjp
00278 jisx0212.1990-0
00279 eucjp
00280 jisx0213.2000-1
00281 eucjp
00282 jisx0213.2000-2
00283 eucjp
00284 shift_jis
00285 sjis
00286 shift-jis
00287 sjis
00288 sjis
00289 sjis
00290 iso-2022-jp
00291 jis7
00292 windows850
00293 ibm850
00294 windows866
00295 ibm866
00296 windows-850
00297 ibm850
00298 windows-866
00299 ibm866
00300 cp-10000
00301 apple roman
00302 thai-tis620
00303 iso 8859-11
00304 windows-874
00305 ibm874
00306 windows874
00307 ibm874
00308 cp-874
00309 ibm874
00310 ksc5601.1987-0
00311 euckr
00312 ks_c_5601-1987
00313 euckr
00314 mac-roman
00315 apple roman
00316 macintosh
00317 apple roman
00318 mac
00319 apple roman
00320 csiso2022jp
00321 iso-2022-jp
00322 */
00323 /*
00324  * Notes about the table:
00325  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
00326  * - utf7 is not in Qt
00327  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
00328  * - sjis: appears on the table for x-sjis
00329  * - jis7: ISO-2022-JP is now the default name in Qt4
00330  * - cp-874: is it really needed?
00331  * - mac-roman: appears on the table for x-mac-roman
00332  * - csiso2022jp: See bug #77243
00333  */
// Flattened string table of lower-case alias names and the codec names they
// map to. Strings are NUL-separated and the table ends with an empty string.
// A target name is stored only once and shared by all aliases that point to
// it. Entries are addressed through the byte offsets in builtin_indices, so
// any change here requires regenerating that table.
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";
00393 
// Index table for builtin_string. The values come in pairs: the byte offset
// of an alias followed by the byte offset of the codec name it maps to.
// A single -1 terminates the list.
static const int builtin_indices[] = {
       0,   11,   18,   11,   30,   39,   50,   39,
      58,   39,   64,   82,   88,   93,  109,   93,
     120,  135,  143,  135,  158,  164,  168,  164,
     175,  164,  189,  196,  201,  208,  214,  221,
     227,  221,  243,  221,  259,  221,  275,  221,
     291,  221,  307,  221,  323,  221,  339,  349,
     354,  349,  349,  349,  364,  376,  381,  392,
     399,  410,  417,  392,  429,  410,  441,  450,
     462,  474,  486,  498,  505,  498,  516,  498,
     523,  208,  538,  208,  553,  450,  563,  450,
     573,  450,  577,  364,   -1
};
00407 
#if 0
// Disabled: alternative names for encodings that are defined by charmap
// files rather than by Qt. Only the (equally disabled) charmap loading code
// in KCharsets::codecForNameOrNull() refers to these tables.
// Even though the charmap file names are all uppercase, the names are all
// lowercase here.
/* input data for generate_string_table.pl:
cp852
ibm852
cp-852
ibm852
x-cp-852
ibm852
windows852
ibm852
windows-852
ibm852
x-windows-852
ibm852
 */
// NUL-separated alias and target names, terminated by an empty string.
static const char aliases_string[] =
    "cp852\0"
    "ibm852\0"
    "cp-852\0"
    "x-cp-852\0"
    "windows852\0"
    "windows-852\0"
    "x-windows-852\0"
    "\0";

// Pairs of byte offsets into aliases_string (alias, target), ended by -1.
static const int aliases_indices[] = {
       0,    6,   13,    6,   20,    6,   29,    6,
      40,    6,   52,    6,   -1
};
#endif
00440 
00441 /*
00442  * some last resort hints in case the charmap file couldn't be found.
00443  * This gives at least a partial conversion and helps making things readable.
00444  *
00445  * the name used as input here is already converted to the more canonical
00446  * name as defined in the aliases array.
00447  *
00448  * Input data:
00449 cp1250
00450 iso-8859-2
00451 koi8-r
00452 iso-8859-5
00453 koi8-u
00454 koi8-r
00455 pt 154
00456 windows-1251
00457 paratype-154
00458 windows-1251
00459 pt-154
00460 windows-1251
00461  */
00462 /* Notes:
00463  * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
00464  */
// Flattened string table of last-resort substitutions: an encoding name and
// the codec name to try instead. Strings are NUL-separated and the table
// ends with an empty string. Entries are addressed through the byte offsets
// in conversion_hints_indices, so any change here requires regenerating
// that table.
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";
00476 
// Index table for conversion_hints_string. The values come in pairs: the
// byte offset of an encoding name followed by the byte offset of its
// substitute. A single -1 terminates the list.
static const int conversion_hints_indices[] = {
       0,    7,   18,   25,   36,   18,   43,   50,
      63,   50,   76,   50,   -1
};
00481 
00482 // search an array of items index/data, find first matching index
00483 // and return data, or return 0
00484 static inline
00485 const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
00486 {
00487     for (int i = 0; indices[i] != -1; i += 2)
00488         if (qstrcmp(start + indices[i], entry) == 0)
00489             return start + indices[i + 1];
00490     return 0;
00491 }
00492 
00493 
00494 class KCharsetsPrivate
00495 {
00496 public:
00497     KCharsetsPrivate(KCharsets* _kc)
00498     {
00499         kc = _kc;
00500         codecForNameDict.reserve( 43 );
00501     }
00502     // Hash for the encoding names (sensitive case)
00503     QHash<QByteArray,QTextCodec*> codecForNameDict;
00504     KCharsets* kc;
00505 
00506     //Cache list so QStrings can be implicitly shared
00507     QList<QStringList> encodingsByScript;
00508 };
00509 
00510 // --------------------------------------------------------------------------
00511 
// Creates the object together with its private data, which is released
// again by the destructor.
KCharsets::KCharsets()
    :d(new KCharsetsPrivate(this))
{
}
00516 
// Releases the private data allocated by the constructor.
KCharsets::~KCharsets()
{
    delete d;
}
00521 
00522 QChar KCharsets::fromEntity(const QString &str)
00523 {
00524     QChar res = QChar::Null;
00525 
00526     if ( str.isEmpty() )
00527         return QChar::Null;
00528 
00529     int pos = 0;
00530     if(str[pos] == '&') pos++;
00531 
00532     // Check for '&#000' or '&#x0000' sequence
00533     if (str[pos] == '#' && str.length()-pos > 1) {
00534         bool ok;
00535         pos++;
00536         if (str[pos] == 'x' || str[pos] == 'X') {
00537             pos++;
00538             // '&#x0000', hexadecimal character reference
00539             const QString tmp( str.mid( pos ) );
00540             res = tmp.toInt(&ok, 16);
00541         } else {
00542             //  '&#0000', decimal character reference
00543             const QString tmp( str.mid( pos ) );
00544             res = tmp.toInt(&ok, 10);
00545         }
00546         if ( ok )
00547             return res;
00548         else
00549             return QChar::Null;
00550     }
00551 
00552     const QByteArray raw ( str.toLatin1() );
00553     const entity *e = kde_findEntity( raw, raw.length() );
00554 
00555     if(!e)
00556     {
00557         //kDebug( 0 ) << "unknown entity " << str <<", len = " << str.length();
00558         return QChar::Null;
00559     }
00560     //kDebug() << "got entity " << str << " = " << e->code;
00561 
00562     return QChar(e->code);
00563 }
00564 
00565 QChar KCharsets::fromEntity(const QString &str, int &len)
00566 {
00567     // entities are never longer than 8 chars... we start from
00568     // that length and work backwards...
00569     len = 8;
00570     while(len > 0)
00571     {
00572         QString tmp = str.left(len);
00573         QChar res = fromEntity(tmp);
00574         if( res != QChar::Null ) return res;
00575         len--;
00576     }
00577     return QChar::Null;
00578 }
00579 
00580 
00581 QString KCharsets::toEntity(const QChar &ch)
00582 {
00583     QString ent;
00584     ent.sprintf("&#0x%x;", ch.unicode());
00585     return ent;
00586 }
00587 
00588 QString KCharsets::resolveEntities( const QString &input )
00589 {
00590     QString text = input;
00591     const QChar *p = text.unicode();
00592     const QChar *end = p + text.length();
00593     const QChar *ampersand = 0;
00594     bool scanForSemicolon = false;
00595 
00596     for ( ; p < end; ++p ) {
00597         const QChar ch = *p;
00598 
00599         if ( ch == '&' ) {
00600             ampersand = p;
00601             scanForSemicolon = true;
00602             continue;
00603         }
00604 
00605         if ( ch != ';' || scanForSemicolon == false )
00606             continue;
00607 
00608         assert( ampersand );
00609 
00610         scanForSemicolon = false;
00611 
00612         const QChar *entityBegin = ampersand + 1;
00613 
00614         const uint entityLength = p - entityBegin;
00615         if ( entityLength == 0 )
00616             continue;
00617 
00618         const QChar entityValue = KCharsets::fromEntity( QString( entityBegin, entityLength ) );
00619         if ( entityValue.isNull() )
00620             continue;
00621 
00622         const uint ampersandPos = ampersand - text.unicode();
00623 
00624         text[ (int)ampersandPos ] = entityValue;
00625         text.remove( ampersandPos + 1, entityLength + 1 );
00626         p = text.unicode() + ampersandPos;
00627         end = text.unicode() + text.length();
00628         ampersand = 0;
00629     }
00630 
00631     return text;
00632 }
00633 
00634 QStringList KCharsets::availableEncodingNames() const
00635 {
00636     QStringList available;
00637     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2)
00638         available.append( QString::fromUtf8( language_for_encoding_string + *p ) );
00639     available.sort();
00640     return available;
00641 }
00642 
00643 QString KCharsets::languageForEncoding( const QString &encoding ) const
00644 {
00645     const char* lang = kcharsets_array_search( (const char*)language_for_encoding_string,
00646                                                language_for_encoding_indices,
00647                                                encoding.toUtf8().constData() );
00648     if ( lang )
00649         return i18nc( "@item Text character set", lang );
00650     else
00651         return i18nc( "@item Text character set", "Other" );
00652 }
00653 
00654 QString KCharsets::descriptionForEncoding( const QString& encoding ) const
00655 {
00656     const char* lang = kcharsets_array_search( language_for_encoding_string,
00657                                                language_for_encoding_indices,
00658                                                encoding.toUtf8() );
00659     if ( lang )
00660         return i18nc( "@item %1 character set, %2 encoding", "%1 ( %2 )",
00661                       i18nc( "@item Text character set", lang ), encoding );
00662     else
00663         return i18nc( "@item", "Other encoding (%1)", encoding );
00664 }
00665 
00666 QString KCharsets::encodingForName( const QString &descriptiveName ) const
00667 {
00668     const int left = descriptiveName.lastIndexOf( '(' );
00669 
00670     if (left<0) // No parenthesis, so assume it is a normal encoding name
00671     return descriptiveName.trimmed();
00672 
00673     QString name(descriptiveName.mid(left+1));
00674 
00675     const int right = name.lastIndexOf( ')' );
00676 
00677     if (right<0)
00678         return name;
00679 
00680     return name.left(right).trimmed();
00681 }
00682 
00683 QStringList KCharsets::descriptiveEncodingNames() const
00684 {
00685     QStringList encodings;
00686     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00687         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00688         const QString description = i18nc( "@item Text character set", language_for_encoding_string + p[1] );
00689         encodings.append( i18nc( "@item Text encoding: %1 character set, %2 encoding", "%1 ( %2 )",
00690                                  description, name ) );
00691     }
00692     encodings.sort();
00693     return encodings;
00694 }
00695 
00696 QList<QStringList> KCharsets::encodingsByScript() const
00697 {
00698     if (!d->encodingsByScript.isEmpty())
00699         return d->encodingsByScript;
00700     int i;
00701     for ( const int *p = language_for_encoding_indices; *p != -1; p += 2) {
00702         const QString name = QString::fromUtf8( language_for_encoding_string + p[0] );
00703         const QString description = i18nc("@item Text character set", language_for_encoding_string + p[1] );
00704 
00705         for (i=0; i<d->encodingsByScript.size(); ++i) {
00706             if (d->encodingsByScript.at(i).at(0) == description) {
00707                 d->encodingsByScript[i].append(name);
00708                 break;
00709             }
00710         }
00711 
00712         if (i==d->encodingsByScript.size()) {
00713             d->encodingsByScript.append(QStringList() << description << name);
00714         }
00715 
00716     }
00717     return d->encodingsByScript;
00718 }
00719 
00720 QTextCodec* KCharsets::codecForName(const QString &n) const
00721 {
00722     const QByteArray name( n.toLatin1() );
00723     QTextCodec* codec = codecForNameOrNull( name );
00724     if ( codec )
00725         return codec;
00726     else
00727         return QTextCodec::codecForName( "iso8859-1" );
00728 }
00729 
00730 QTextCodec* KCharsets::codecForName(const QString &n, bool &ok) const
00731 {
00732     const QByteArray name( n.toLatin1() );
00733     QTextCodec* codec = codecForNameOrNull( name );
00734     if ( codec )
00735     {
00736         ok = true;
00737         return codec;
00738     }
00739     else
00740     {
00741         ok = false;
00742         return QTextCodec::codecForName( "iso8859-1" );
00743     }
00744 }
00745 
// Resolves the encoding name 'n' to a codec, or returns 0 if none is found.
// Every successful lookup is cached in d->codecForNameDict under the name
// exactly as it was passed in.
//
// Lookup order:
//  1. empty name: the codec of KGlobal::locale(), cached as "->locale<-"
//  2. the cache
//  3. QTextCodec::codecForName() with the unmodified name
//  4. the lower-cased name with a trailing "_charset" and a leading "x-"
//     removed (handed to QTextCodec only if something was removed)
//  5. the alias table builtin_string / builtin_indices
//  6. the last-resort table conversion_hints_string / conversion_hints_indices
// The charmap file loading between steps 5 and 6 is disabled (#if 0).
QTextCodec *KCharsets::codecForNameOrNull( const QByteArray& n ) const
{
    QTextCodec* codec = 0;

    if (n.isEmpty()) {
        // No name, assume locale (KDE's, not Qt's)
        const QByteArray locale = "->locale<-";
        if ( d->codecForNameDict.contains( locale ) )
            return d->codecForNameDict.value( locale );
        codec = KGlobal::locale()->codecForEncoding();
        d->codecForNameDict.insert("->locale<-", codec);
        return codec;
    }
    // For a non-empty name, look it up in the "dictionary", in a case-sensitive way.
    else if ( d->codecForNameDict.contains( n ) ) {
        return d->codecForNameDict.value( n );
    }

    // If the name is not in the hash table, call QTextCodec::codecForName directly.
    // We assume that QTextCodec is smarter and more maintained than this code.
    codec = QTextCodec::codecForName( n );
    if ( codec ) {
        d->codecForNameDict.insert( n, codec );
        return codec;
    }

    // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it.

    QByteArray name = n.toLower();
    bool changed = false;
    if (name.endsWith("_charset")) {
       name.chop( 8 );
       changed = true;
    }
    if ( name.startsWith( "x-" ) ) {
       name.remove( 0, 2 ); // remove x- at start
       changed = true;
    }

    if (name.isEmpty()) {
      // We have no name anymore, therefore the name is invalid.
      return 0;
    }

    // We only need to check changed names.
    if ( changed ) {
        codec = QTextCodec::codecForName(name);
        if (codec) {
            // The cache key stays the original name n, not the processed one.
            d->codecForNameDict.insert( n, codec );
            return codec;
        }
        changed = false;
    }

    // these codecs are built into Qt, but the name given for the codec is different,
    // so QTextCodec did not recognize it.
    QByteArray cname = kcharsets_array_search( builtin_string, builtin_indices, name);

    if(!cname.isEmpty())
        codec = QTextCodec::codecForName(cname);

    if (codec)
    {
        d->codecForNameDict.insert( n, codec );
        return codec;
    }

#ifdef __GNUC__
#warning is it still useful with Qt4 ?
#endif
    //don't forget to remove the #if 0 on a few structs at the top also if you reenable that ;)  (search for 852 )
    //from what I understood, one needs to create a QTextCodecPlugin in order to be able to support a new Codec, but I do not
    //know how to convert a charmap to a QTextCodec and the real big question is whether we need that at all ...  (mikmak)
        // Yes, it is useful (for examples EBCDIC in Kate or codepages for KOffice filters from/to MS formats) (goutte)
#if 0
    QString dir;
    {
    KConfigGroup cg( KGlobal::config(), "i18n" );
    dir = cg.readPathEntry("i18ndir", QLatin1String("/usr/share/i18n/charmaps"));
    }

    // these are codecs not included in Qt. They can be build up if the corresponding charmap
    // is available in the charmap directory.
    cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());

    if(cname.isEmpty())
        cname = name;
    cname = cname.toUpper();

    const QString basicName = QLatin1String(cname);
    kDebug() << endl << " Trying to find " << cname << " in " << dir;

    QString charMapFileName;
    bool gzipped = false;
    QDir qdir(dir);
    if (!qdir.exists()) {
        // The directory for the charmaps does not even exist... (That is common!)
    }
    else if (qdir.exists(basicName, false)) {
        charMapFileName = basicName;
    }
    else if (qdir.exists(basicName+".gz", false)) {
        charMapFileName = basicName + ".gz";
        gzipped = true;
    }
    else {
        // Check if we are asking a code page
        // If yes, then check "CP99999" and "IBM99999"
        // First we need to find the number of the codepage
        QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
        if ( regexp.search(basicName) != -1) {
            const QString num = regexp.cap(4);
            if (num.isEmpty()) {
                // No number, not a code page (or something went wrong)
            }
            else if (qdir.exists("IBM"+num)) {
                charMapFileName = "IBM"+num;
            }
            else if (qdir.exists("IBM"+num+".gz")) {
                charMapFileName = "IBM"+num+".gz";
                gzipped = true;
            }
            else if (qdir.exists("CP"+num)) {
                charMapFileName = "CP"+num;
            }
            else if (qdir.exists("CP"+num+".gz")) {
                charMapFileName = "CP"+num+".gz";
                gzipped = true;
            }
        }
    }

    if (gzipped && !charMapFileName.isEmpty()) {
        KFilterDev gzip(dir + '/' + charMapFileName);
        if (gzip.open(QIODevice::ReadOnly)) {
            kDebug() << "Loading gzipped charset...";
            codec = QTextCodec::loadCharmap(&gzip);
            gzip.close();
        }
        else
            kWarning() << "Could not open gzipped charset!";
    }
    else if (!charMapFileName.isEmpty()) {
        codec = QTextCodec::loadCharmapFile(dir + '/' + charMapFileName);
    }

    if(codec) {
        d->codecForNameDict.insert( n, codec );
        return codec;
    }
#endif

    // this also failed, the last resort is now to take some compatibility charmap
    // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write.
    cname = kcharsets_array_search( conversion_hints_string, conversion_hints_indices, name );

    if (!cname.isEmpty()) {
        codec = QTextCodec::codecForName(cname);
        if (codec) {
            d->codecForNameDict.insert( n, codec );
            return codec;
        }
    }

    // we could not assign a codec, therefore return NULL
    // (failed lookups are not cached)
    return 0;
}

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • KIO
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • Kross
  • KUtils
  • Nepomuk
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.5.4
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal