CrystalSpace

Public API Reference

Main Page   Modules   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

csutil/csuctransform.h

Go to the documentation of this file.
00001 /*
00002     Copyright (C) 2003 by Frank Richter
00003 
00004     This library is free software; you can redistribute it and/or
00005     modify it under the terms of the GNU Library General Public
00006     License as published by the Free Software Foundation; either
00007     version 2 of the License, or (at your option) any later version.
00008 
00009     This library is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012     Library General Public License for more details.
00013 
00014     You should have received a copy of the GNU Library General Public
00015     License along with this library; if not, write to the Free
00016     Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00017 */
00018 
00019 #ifndef __CS_CSUCTRANSFORM_H__
00020 #define __CS_CSUCTRANSFORM_H__
00021 
00022 #include "csunicode.h"
00023 
00031 
/* Largest number of utf8_char units csUnicodeTransform::EncodeUTF8() emits
 * for a single code point (its longest branch writes six units). */
00032 #define CS_UC_MAX_UTF8_ENCODED          6
00033 
/* Largest number of utf16_char units csUnicodeTransform::EncodeUTF16() emits
 * for a single code point (one unit, or a two-unit surrogate pair). */
00034 #define CS_UC_MAX_UTF16_ENCODED         2
00035 
/* Number of utf32_char units csUnicodeTransform::EncodeUTF32() emits for a
 * single code point. */
00036 #define CS_UC_MAX_UTF32_ENCODED         1
00037  
00041 class csUnicodeTransform
00042 {
00043 public:
00044 #define FAIL(ret)                               \
00045   {                                             \
00046     if (isValid) *isValid = false;              \
00047     ch = CS_UC_CHAR_REPLACER;                   \
00048     return ret;                                 \
00049   }
00050 
00051 #define SUCCEED                                 \
00052     if (isValid) *isValid = true;               \
00053     return chUsed;
00054   
00055 #define GET_NEXT(next)  \
00056   if ((size_t)chUsed == strlen)                 \
00057   {                                             \
00058     FAIL(chUsed);                               \
00059   }                                             \
00060   next = *str++;                                \
00061   if (next == 0)                                \
00062   {                                             \
00063     FAIL(chUsed);                               \
00064   }                                             \
00065   chUsed++;                                     
00066   
00082   inline static int UTF8Decode (const utf8_char* str, size_t strlen, 
00083     utf32_char& ch, bool* isValid = 0)
00084   {
00085     if (str == 0)
00086     {
00087       FAIL(0);
00088     }
00089     int chUsed = 0;
00090     
00091     utf8_char curCh;
00092     GET_NEXT(curCh);
00093     if ((curCh & 0x80) == 0)
00094     {
00095       // easy case
00096       ch = curCh;
00097       SUCCEED;
00098     }
00099     else
00100     {
00101       // Count with how many bytes this char is encoded.
00102       int n = 0;
00103       while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; }
00104 
00105       if ((n < 2) || (n > 6))
00106       {
00107         // Invalid code: first char of a "sequence" must have
00108         // at least two and at most six MSBs set
00109         FAIL(1);
00110       }
00111 
00112       ch = (curCh & ((1 << (8 - n)) - 1));
00113       
00114       for (int i = 1; i < n; i++)
00115       {
00116         GET_NEXT(curCh);
00117         if ((curCh & 0xc0) != 0x80)
00118         {
00119           FAIL(chUsed);
00120         }
00121         else
00122         {
00123           ch <<= 6;
00124           ch |= (curCh & 0x3f);
00125         }
00126       }
00127       
00128       // Check for "overlong" codes.
00129       if ((ch < 0x80) && (n > 0))
00130       {
00131         FAIL(chUsed);
00132       }
00133       else if ((ch < 0x800) && (n > 2))
00134       {
00135         FAIL(chUsed);
00136       }
00137       else if ((ch < 0x10000) && (n > 3))
00138       {
00139         FAIL(chUsed);
00140       }
00141       else if ((ch < 0x200000) && (n > 4))
00142       {
00143         FAIL(chUsed);
00144       }
00145       else if ((ch < 0x4000000) && (n > 5))
00146       {
00147         FAIL(chUsed);
00148       }
00149       else if ((ch < 0x80000000) && (n > 6))
00150       {
00151         FAIL(chUsed);
00152       }
00153       
00154       if (CS_UC_IS_INVALID(ch) || CS_UC_IS_SURROGATE(ch))
00155         FAIL(chUsed);
00156       SUCCEED;
00157     }
00158   }
00159   
00164   inline static int UTF16Decode (const utf16_char* str, size_t strlen, 
00165     utf32_char& ch, bool* isValid = 0)
00166   {
00167     if (str == 0)
00168     {
00169       FAIL(0);
00170     }
00171     int chUsed = 0;
00172     
00173     utf16_char curCh;
00174     GET_NEXT(curCh);
00175     // Decode surrogate
00176     if (CS_UC_IS_SURROGATE (curCh))
00177     {
00178       // Invalid code
00179       if (!CS_UC_IS_HIGH_SURROGATE (curCh))
00180       {
00181         FAIL(chUsed);
00182       }
00183       ch = (curCh & 0x03ff) << 10;
00184       GET_NEXT(curCh);
00185       // Invalid code
00186       if (!CS_UC_IS_LOW_SURROGATE (curCh))
00187       {
00188         // Fail with 1 so the char is handled upon the next Decode.
00189         FAIL(1);
00190       }
00191       ch |= (curCh & 0x3ff);
00192       // Check for "overlong" codes
00193       if ((ch == 0) || (ch < 0x10000))
00194         FAIL(chUsed);
00195     }
00196     else
00197     {
00198       ch = curCh;
00199     }
00200     if (CS_UC_IS_INVALID(ch))
00201       FAIL(chUsed);
00202     SUCCEED;
00203   }
00204   
00209   inline static int UTF32Decode (const utf32_char* str, size_t strlen, 
00210     utf32_char& ch, bool* isValid = 0)
00211   {
00212     if (str == 0)
00213     {
00214       FAIL(0);
00215     }
00216     int chUsed = 0;
00217     
00218     GET_NEXT(ch);
00219     if (CS_UC_IS_INVALID(ch))
00220       FAIL(chUsed);
00221     SUCCEED;
00222   }
00224 #undef FAIL
00225 #undef SUCCEED
00226 #undef GET_NEXT
00227 
00228 #define _OUTPUT_CHAR(buf, chr)                          \
00229   if (bufRemaining > 0)                                 \
00230   {                                                     \
00231     if(buf) *buf++ = chr;                               \
00232     bufRemaining--;                                     \
00233   }                                                     \
00234   encodedLen++;
00235 
00236 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr)
00237   
00250   inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 
00251     size_t bufsize)
00252   {
00253     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00254       return 0;
00255     size_t bufRemaining = bufsize;
00256     int encodedLen = 0;
00257     
00258     if (ch < 0x80)
00259     {
00260       OUTPUT_CHAR ((utf8_char)ch);
00261     }
00262     else if (ch < 0x800)
00263     {
00264       OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6)));
00265       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00266     }
00267     else if (ch < 0x10000)
00268     {
00269       OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12)));
00270       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00271       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00272     }
00273     else if (ch < 0x200000)
00274     {
00275       OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18)));
00276       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00277       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00278       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00279     }
00280     else if (ch < 0x4000000)
00281     {
00282       OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24)));
00283       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00284       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00285       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00286       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00287     }
00288     else if (ch < 0x80000000)
00289     {
00290       OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30)));
00291       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f)));
00292       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00293       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00294       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00295       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00296     }
00297     return encodedLen;
00298   }
00299     
00304   inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 
00305     size_t bufsize)
00306   {
00307     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00308       return 0;
00309     size_t bufRemaining = bufsize;
00310     int encodedLen = 0;
00311     
00312     if (ch < 0x10000)
00313     {
00314       OUTPUT_CHAR((utf16_char)ch);
00315     }
00316     else if (ch < 0x100000)
00317     {
00318       OUTPUT_CHAR((utf16_char)((ch >> 10) | CS_UC_CHAR_HIGH_SURROGATE_FIRST));
00319       OUTPUT_CHAR((utf16_char)((ch & 0x3ff) | CS_UC_CHAR_LOW_SURROGATE_FIRST));
00320     }
00321     else
00322       return 0;
00323     
00324     return encodedLen;
00325   }
00326 
00331   inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 
00332     size_t bufsize)
00333   {
00334     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00335       return 0;
00336     size_t bufRemaining = bufsize;
00337     int encodedLen = 0;
00338     
00339     OUTPUT_CHAR(ch);
00340     
00341     return encodedLen;
00342   }
00344 #undef OUTPUT_CHAR
00345   
00346 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr)
00347   
00348 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder)    \
00349   inline static size_t funcName (toType* dest, size_t destSize,         \
00350     const fromType* source, size_t srcSize = (size_t)-1)                \
00351   {                                                                     \
00352     if ((srcSize == 0) || (source == 0))                                \
00353       return 0;                                                         \
00354                                                                         \
00355     size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0;            \
00356     size_t encodedLen = 0;                                              \
00357                                                                         \
00358     size_t srcChars = srcSize;                                          \
00359                                                                         \
00360     if (srcSize == (size_t)-1)                                          \
00361     {                                                                   \
00362       srcChars = 0;                                                     \
00363       const fromType* sptr = source;                                    \
00364       while (*sptr++ != 0) srcChars++;                                  \
00365     }                                                                   \
00366                                                                         \
00367     while (srcChars > 0)                                                \
00368     {                                                                   \
00369       utf32_char ch;                                                    \
00370       int scnt = decoder (source, srcChars, ch, 0);                     \
00371       if (scnt == 0) break;                                             \
00372       int dcnt = encoder (ch, dest, bufRemaining);                      \
00373       if (dcnt == 0)                                                    \
00374       {                                                                 \
00375         dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining);       \
00376       }                                                                 \
00377                                                                         \
00378       if ((size_t)dcnt >= bufRemaining)                                 \
00379       {                                                                 \
00380         if (dest && (destSize > 0)) dest += bufRemaining;               \
00381         bufRemaining = 0;                                               \
00382       }                                                                 \
00383       else                                                              \
00384       {                                                                 \
00385         bufRemaining -= dcnt;                                           \
00386         if (dest && (destSize > 0)) dest += dcnt;                       \
00387       }                                                                 \
00388       encodedLen += dcnt;                                               \
00389       if ((size_t)scnt >= srcChars) break;                              \
00390       srcChars -= scnt;                                                 \
00391       source += scnt;                                                   \
00392     }                                                                   \
00393                                                                         \
00394     if (dest) *dest = 0;                                                \
00395                                                                         \
00396     return encodedLen + 1;                                              \
00397   }
00398 
00416   UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16);
00421   UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32);
00422 
00427   UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8);
00432   UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32);
00433   
00438   UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8);
00443   UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16);
00446 #undef UCTF_CONVERTER
00447 #undef OUTPUT_CHAR
00448 
00449 #if (CS_WCHAR_T_SIZE == 1)
00450   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00451     const utf8_char* source, size_t srcSize)
00452   {
00453     size_t srcChars = srcSize;                                          
00454     if (srcSize == (size_t)-1)                                          
00455     {                                                                   
00456       srcChars = 0;                                                     
00457       const utf8_char* sptr = source;                                   
00458       while (*sptr++ != 0) srcChars++;                                  
00459     }                           
00460     if ((dest != 0) && (destSize != 0))
00461     {
00462       size_t len = MIN (destSize - 1, srcChars);
00463       memcpy (dest, source, size * sizeof (wchar_t));
00464       *(dest + len) = 0;
00465     }
00466     return srcChars + 1;
00467   };
00468 
00469   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00470     const utf16_char* source, size_t srcSize)
00471   {
00472     return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize);
00473   };
00474 
00475   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00476     const utf32_char* source, size_t srcSize)
00477   {
00478     return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize);
00479   };
00480   
00481   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00482     const wchar_t* source, size_t srcSize)
00483   {
00484     size_t srcChars = srcSize;                                          
00485     if (srcSize == (size_t)-1)                                          
00486     {                                                                   
00487       srcChars = 0;                                                     
00488       const wchar_t* sptr = source;                                     
00489       while (*sptr++ != 0) srcChars++;                                  
00490     }                           
00491     if ((dest != 0) && (destSize != 0))
00492     {
00493       size_t len = MIN (destSize - 1, srcChars);
00494       memcpy (dest, source, len * sizeof (wchar_t));
00495       *(dest + len) = 0;
00496     }
00497     return srcChars + 1;
00498   };
00499 
00500   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00501     const wchar_t* source, size_t srcSize)
00502   {
00503     return UTF8to16 (dest, destSize, source, srcSize);
00504   };
00505 
00506   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00507     const wchar_t* source, size_t srcSize)
00508   {
00509     return UTF8to32 (dest, destSize, source, srcSize);
00510   };
00511 #elif (CS_WCHAR_T_SIZE == 2)
00512   // Methods below for doxygen documentation are here as the size '2' is 
00513   // default.
00514   
00521   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00522     const utf8_char* source, size_t srcSize)
00523   {
00524     return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize);
00525   };
00526 
00531   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00532     const utf16_char* source, size_t srcSize)
00533   {
00534     size_t srcChars = srcSize;                                          
00535     if (srcSize == (size_t)-1)                                          
00536     {                                                                   
00537       srcChars = 0;                                                     
00538       const utf16_char* sptr = source;                                  
00539       while (*sptr++ != 0) srcChars++;                                  
00540     }                           
00541     if ((dest != 0) && (destSize != 0))
00542     {
00543       size_t len = MIN (destSize - 1, srcChars);
00544       memcpy (dest, source, len * sizeof (wchar_t));
00545       *(dest + len) = 0;
00546     }
00547     return srcChars + 1;
00548   };
00549 
00554   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00555     const utf32_char* source, size_t srcSize)
00556   {
00557     return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize);
00558   };
00559   
00564   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00565     const wchar_t* source, size_t srcSize)
00566   {
00567     return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize);
00568   };
00569 
00574   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00575     const wchar_t* source, size_t srcSize)
00576   {
00577     size_t srcChars = srcSize;                                          
00578     if (srcSize == (size_t)-1)                                          
00579     {                                                                   
00580       srcChars = 0;                                                     
00581       const wchar_t* sptr = source;                                     
00582       while (*sptr++ != 0) srcChars++;                                  
00583     }                           
00584     if ((dest != 0) && (destSize != 0))
00585     {
00586       size_t len = MIN (destSize - 1, srcChars);
00587       memcpy (dest, source, len * sizeof (wchar_t));
00588       *(dest + len) = 0;
00589     }
00590     return srcChars + 1;
00591   };
00592 
00597   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00598     const wchar_t* source, size_t srcSize)
00599   {
00600     return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize);
00601   };
00603 #elif (CS_WCHAR_T_SIZE == 4)
00604   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00605     const utf8_char* source, size_t srcSize)
00606   {
00607     return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize);
00608   };
00609 
00610   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00611     const utf16_char* source, size_t srcSize)
00612   {
00613     return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize);
00614   };
00615 
00616   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00617     const utf32_char* source,  size_t srcSize)
00618   {
00619     size_t srcChars = srcSize;                                          
00620     if (srcSize == (size_t)-1)                                          
00621     {                                                                   
00622       srcChars = 0;                                                     
00623       const utf32_char* sptr = source;                                  
00624       while (*sptr++ != 0) srcChars++;                                  
00625     }                           
00626     if ((dest != 0) && (destSize != 0))
00627     {
00628       size_t len = MIN (destSize - 1, srcChars);
00629       memcpy (dest, source, len * sizeof (wchar_t));
00630       *(dest + len) = 0;
00631     }
00632     return srcChars + 1;
00633   };
00634   
00635   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00636     const wchar_t* source, size_t srcSize)
00637   {
00638     return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize);
00639   };
00640 
00641   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00642     const wchar_t* source, size_t srcSize)
00643   {
00644     return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize);
00645   };
00646 
00647   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00648     const wchar_t* source, size_t srcSize)
00649   {
00650     size_t srcChars = srcSize;                                          
00651     if (srcSize == (size_t)-1)                                          
00652     {                                                                   
00653       srcChars = 0;                                                     
00654       const wchar_t* sptr = source;                                     
00655       while (*sptr++ != 0) srcChars++;                                  
00656     }                           
00657     if ((dest != 0) && (destSize != 0))
00658     {
00659       size_t len = MIN (destSize - 1, srcChars);
00660       memcpy (dest, source, len * sizeof (wchar_t));
00661       *(dest + len) = 0;
00662     }
00663     return srcChars + 1;
00664   };
00665 #else
00666   #error Odd-sized, unsupported wchar_t!
00667 #endif
00668 
00681   inline static int UTF8Skip (const utf8_char* str, size_t maxSkip)
00682   {
00683     if (maxSkip < 1) return 0;
00684   
00685     if ((*str & 0x80) == 0)
00686     {
00687       return 1;
00688     }
00689     else
00690     {
00691       int n = 0;
00692       while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; }
00693 
00694       if ((n < 2) || (n > 6))
00695       {
00696         return 1;
00697       }
00698 
00699       int skip = 1;
00700       
00701       for (; skip < n; skip++)
00702       {
00703         if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip))
00704         {
00705           break;
00706         }
00707       }
00708       return skip;
00709     }
00710   }
00711   
00721   inline static int UTF8Rewind (const utf8_char* str, size_t maxRew)
00722   {
00723     if (maxRew < 1) return 0;
00724     
00725     const utf8_char* pos = str - 1;
00726     
00727     if ((*pos & 0x80) == 0)
00728     {
00729       return 1;
00730     }
00731     
00732     // Skip backward to the first byte of the sequence.
00733     int skip = 1;
00734     while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew))
00735     {
00736       skip++;
00737       pos--;
00738     }
00739     
00740     return skip;
00741   }
00742   
00748   inline static int UTF16Skip (const utf16_char* str, size_t maxSkip)
00749   {
00750     if (CS_UC_IS_HIGH_SURROGATE (*str))
00751       return (int)(MIN(maxSkip, 2));
00752     else
00753       return (int)(MIN(maxSkip, 1));
00754   }
00755   
00761   inline static int UTF16Rewind (const utf16_char* str, size_t maxRew)
00762   {
00763     if (maxRew < 1) return 0;
00764     
00765     const utf16_char* pos = str - 1;
00766     if (!CS_UC_IS_SURROGATE(*pos)) 
00767       return 1;
00768     else
00769     {
00770       if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1))))
00771         return 2;
00772       else
00773         return 1;
00774     }
00775   }
00776   
00782   inline static int UTF32Skip (const utf32_char* str, size_t maxSkip)
00783   {
00784     return (int)(MIN(maxSkip, 1));
00785   }
00786 
00792   inline static int UTF32Rewind (const utf32_char* str, size_t maxRew)
00793   {
00794     if (maxRew < 1) return 0;
00795     return 1;
00796   }
00798 };
00799 
00802 #endif
00803 

Generated for Crystal Space by doxygen 1.2.18