csutil/csuctransform.h
Go to the documentation of this file.00001 /* 00002 Copyright (C) 2003 by Frank Richter 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public 00015 License along with this library; if not, write to the Free 00016 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00017 */ 00018 00019 #ifndef __CS_CSUCTRANSFORM_H__ 00020 #define __CS_CSUCTRANSFORM_H__ 00021 00022 #include "csunicode.h" 00023 00031 00032 #define CS_UC_MAX_UTF8_ENCODED 6 00033 00034 #define CS_UC_MAX_UTF16_ENCODED 2 00035 00036 #define CS_UC_MAX_UTF32_ENCODED 1 00037 00041 class csUnicodeTransform 00042 { 00043 public: 00044 #define FAIL(ret) \ 00045 { \ 00046 if (isValid) *isValid = false; \ 00047 ch = CS_UC_CHAR_REPLACER; \ 00048 return ret; \ 00049 } 00050 00051 #define SUCCEED \ 00052 if (isValid) *isValid = true; \ 00053 return chUsed; 00054 00055 #define GET_NEXT(next) \ 00056 if ((size_t)chUsed == strlen) \ 00057 { \ 00058 FAIL(chUsed); \ 00059 } \ 00060 next = *str++; \ 00061 if (next == 0) \ 00062 { \ 00063 FAIL(chUsed); \ 00064 } \ 00065 chUsed++; 00066 00082 inline static int UTF8Decode (const utf8_char* str, size_t strlen, 00083 utf32_char& ch, bool* isValid = 0) 00084 { 00085 if (str == 0) 00086 { 00087 FAIL(0); 00088 } 00089 int chUsed = 0; 00090 00091 utf8_char curCh; 00092 GET_NEXT(curCh); 00093 if ((curCh & 0x80) == 0) 00094 { 00095 // easy case 00096 ch = curCh; 00097 SUCCEED; 00098 } 00099 else 00100 { 00101 // Count with how many bytes this char is encoded. 00102 int n = 0; 00103 while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; } 00104 00105 if ((n < 2) || (n > 6)) 00106 { 00107 // Invalid code: first char of a "sequence" must have 00108 // at least two and at most six MSBs set 00109 FAIL(1); 00110 } 00111 00112 ch = (curCh & ((1 << (8 - n)) - 1)); 00113 00114 for (int i = 1; i < n; i++) 00115 { 00116 GET_NEXT(curCh); 00117 if ((curCh & 0xc0) != 0x80) 00118 { 00119 FAIL(chUsed); 00120 } 00121 else 00122 { 00123 ch <<= 6; 00124 ch |= (curCh & 0x3f); 00125 } 00126 } 00127 00128 // Check for "overlong" codes. 00129 if ((ch < 0x80) && (n > 0)) 00130 { 00131 FAIL(chUsed); 00132 } 00133 else if ((ch < 0x800) && (n > 2)) 00134 { 00135 FAIL(chUsed); 00136 } 00137 else if ((ch < 0x10000) && (n > 3)) 00138 { 00139 FAIL(chUsed); 00140 } 00141 else if ((ch < 0x200000) && (n > 4)) 00142 { 00143 FAIL(chUsed); 00144 } 00145 else if ((ch < 0x4000000) && (n > 5)) 00146 { 00147 FAIL(chUsed); 00148 } 00149 else if ((ch < 0x80000000) && (n > 6)) 00150 { 00151 FAIL(chUsed); 00152 } 00153 00154 if (CS_UC_IS_INVALID(ch) || CS_UC_IS_SURROGATE(ch)) 00155 FAIL(chUsed); 00156 SUCCEED; 00157 } 00158 } 00159 00164 inline static int UTF16Decode (const utf16_char* str, size_t strlen, 00165 utf32_char& ch, bool* isValid = 0) 00166 { 00167 if (str == 0) 00168 { 00169 FAIL(0); 00170 } 00171 int chUsed = 0; 00172 00173 utf16_char curCh; 00174 GET_NEXT(curCh); 00175 // Decode surrogate 00176 if (CS_UC_IS_SURROGATE (curCh)) 00177 { 00178 // Invalid code 00179 if (!CS_UC_IS_HIGH_SURROGATE (curCh)) 00180 { 00181 FAIL(chUsed); 00182 } 00183 ch = (curCh & 0x03ff) << 10; 00184 GET_NEXT(curCh); 00185 // Invalid code 00186 if (!CS_UC_IS_LOW_SURROGATE (curCh)) 00187 { 00188 // Fail with 1 so the char is handled upon the next Decode. 00189 FAIL(1); 00190 } 00191 ch |= (curCh & 0x3ff); 00192 // Check for "overlong" codes 00193 if ((ch == 0) || (ch < 0x10000)) 00194 FAIL(chUsed); 00195 } 00196 else 00197 { 00198 ch = curCh; 00199 } 00200 if (CS_UC_IS_INVALID(ch)) 00201 FAIL(chUsed); 00202 SUCCEED; 00203 } 00204 00209 inline static int UTF32Decode (const utf32_char* str, size_t strlen, 00210 utf32_char& ch, bool* isValid = 0) 00211 { 00212 if (str == 0) 00213 { 00214 FAIL(0); 00215 } 00216 int chUsed = 0; 00217 00218 GET_NEXT(ch); 00219 if (CS_UC_IS_INVALID(ch)) 00220 FAIL(chUsed); 00221 SUCCEED; 00222 } 00224 #undef FAIL 00225 #undef SUCCEED 00226 #undef GET_NEXT 00227 00228 #define _OUTPUT_CHAR(buf, chr) \ 00229 if (bufRemaining > 0) \ 00230 { \ 00231 if(buf) *buf++ = chr; \ 00232 bufRemaining--; \ 00233 } \ 00234 encodedLen++; 00235 00236 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr) 00237 00250 inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 00251 size_t bufsize) 00252 { 00253 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 00254 return 0; 00255 size_t bufRemaining = bufsize; 00256 int encodedLen = 0; 00257 00258 if (ch < 0x80) 00259 { 00260 OUTPUT_CHAR ((utf8_char)ch); 00261 } 00262 else if (ch < 0x800) 00263 { 00264 OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6))); 00265 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00266 } 00267 else if (ch < 0x10000) 00268 { 00269 OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12))); 00270 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00271 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00272 } 00273 else if (ch < 0x200000) 00274 { 00275 OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18))); 00276 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00277 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00278 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00279 } 00280 else if (ch < 0x4000000) 00281 { 00282 OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24))); 00283 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00284 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00285 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00286 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00287 } 00288 else if (ch < 0x80000000) 00289 { 00290 OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30))); 00291 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f))); 00292 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00293 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00294 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00295 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00296 } 00297 return encodedLen; 00298 } 00299 00304 inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 00305 size_t bufsize) 00306 { 00307 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 00308 return 0; 00309 size_t bufRemaining = bufsize; 00310 int encodedLen = 0; 00311 00312 if (ch < 0x10000) 00313 { 00314 OUTPUT_CHAR((utf16_char)ch); 00315 } 00316 else if (ch < 0x100000) 00317 { 00318 OUTPUT_CHAR((utf16_char)((ch >> 10) | CS_UC_CHAR_HIGH_SURROGATE_FIRST)); 00319 OUTPUT_CHAR((utf16_char)((ch & 0x3ff) | CS_UC_CHAR_LOW_SURROGATE_FIRST)); 00320 } 00321 else 00322 return 0; 00323 00324 return encodedLen; 00325 } 00326 00331 inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 00332 size_t bufsize) 00333 { 00334 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 00335 return 0; 00336 size_t bufRemaining = bufsize; 00337 int encodedLen = 0; 00338 00339 OUTPUT_CHAR(ch); 00340 00341 return encodedLen; 00342 } 00344 #undef OUTPUT_CHAR 00345 00346 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr) 00347 00348 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder) \ 00349 inline static size_t funcName (toType* dest, size_t destSize, \ 00350 const fromType* source, size_t srcSize = (size_t)-1) \ 00351 { \ 00352 if ((srcSize == 0) || (source == 0)) \ 00353 return 0; \ 00354 \ 00355 size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0; \ 00356 size_t encodedLen = 0; \ 00357 \ 00358 size_t srcChars = srcSize; \ 00359 \ 00360 if (srcSize == (size_t)-1) \ 00361 { \ 00362 srcChars = 0; \ 00363 const fromType* sptr = source; \ 00364 while (*sptr++ != 0) srcChars++; \ 00365 } \ 00366 \ 00367 while (srcChars > 0) \ 00368 { \ 00369 utf32_char ch; \ 00370 int scnt = decoder (source, srcChars, ch, 0); \ 00371 if (scnt == 0) break; \ 00372 int dcnt = encoder (ch, dest, bufRemaining); \ 00373 if (dcnt == 0) \ 00374 { \ 00375 dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining); \ 00376 } \ 00377 \ 00378 if ((size_t)dcnt >= bufRemaining) \ 00379 { \ 00380 if (dest && (destSize > 0)) dest += bufRemaining; \ 00381 bufRemaining = 0; \ 00382 } \ 00383 else \ 00384 { \ 00385 bufRemaining -= dcnt; \ 00386 if (dest && (destSize > 0)) dest += dcnt; \ 00387 } \ 00388 encodedLen += dcnt; \ 00389 if ((size_t)scnt >= srcChars) break; \ 00390 srcChars -= scnt; \ 00391 source += scnt; \ 00392 } \ 00393 \ 00394 if (dest) *dest = 0; \ 00395 \ 00396 return encodedLen + 1; \ 00397 } 00398 00416 UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16); 00421 UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32); 00422 00427 UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8); 00432 UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32); 00433 00438 UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8); 00443 UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16); 00446 #undef UCTF_CONVERTER 00447 #undef OUTPUT_CHAR 00448 00449 #if (CS_WCHAR_T_SIZE == 1) 00450 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00451 const utf8_char* source, size_t srcSize) 00452 { 00453 size_t srcChars = srcSize; 00454 if (srcSize == (size_t)-1) 00455 { 00456 srcChars = 0; 00457 const utf8_char* sptr = source; 00458 while (*sptr++ != 0) srcChars++; 00459 } 00460 if ((dest != 0) && (destSize != 0)) 00461 { 00462 size_t len = MIN (destSize - 1, srcChars); 00463 memcpy (dest, source, size * sizeof (wchar_t)); 00464 *(dest + len) = 0; 00465 } 00466 return srcChars + 1; 00467 }; 00468 00469 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00470 const utf16_char* source, size_t srcSize) 00471 { 00472 return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize); 00473 }; 00474 00475 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00476 const utf32_char* source, size_t srcSize) 00477 { 00478 return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize); 00479 }; 00480 00481 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00482 const wchar_t* source, size_t srcSize) 00483 { 00484 size_t srcChars = srcSize; 00485 if (srcSize == (size_t)-1) 00486 { 00487 srcChars = 0; 00488 const wchar_t* sptr = source; 00489 while (*sptr++ != 0) srcChars++; 00490 } 00491 if ((dest != 0) && (destSize != 0)) 00492 { 00493 size_t len = MIN (destSize - 1, srcChars); 00494 memcpy (dest, source, len * sizeof (wchar_t)); 00495 *(dest + len) = 0; 00496 } 00497 return srcChars + 1; 00498 }; 00499 00500 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00501 const wchar_t* source, size_t srcSize) 00502 { 00503 return UTF8to16 (dest, destSize, source, srcSize); 00504 }; 00505 00506 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00507 const wchar_t* source, size_t srcSize) 00508 { 00509 return UTF8to32 (dest, destSize, source, srcSize); 00510 }; 00511 #elif (CS_WCHAR_T_SIZE == 2) 00512 // Methods below for doxygen documentation are here as the size '2' is 00513 // default. 00514 00521 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00522 const utf8_char* source, size_t srcSize) 00523 { 00524 return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize); 00525 }; 00526 00531 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00532 const utf16_char* source, size_t srcSize) 00533 { 00534 size_t srcChars = srcSize; 00535 if (srcSize == (size_t)-1) 00536 { 00537 srcChars = 0; 00538 const utf16_char* sptr = source; 00539 while (*sptr++ != 0) srcChars++; 00540 } 00541 if ((dest != 0) && (destSize != 0)) 00542 { 00543 size_t len = MIN (destSize - 1, srcChars); 00544 memcpy (dest, source, len * sizeof (wchar_t)); 00545 *(dest + len) = 0; 00546 } 00547 return srcChars + 1; 00548 }; 00549 00554 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00555 const utf32_char* source, size_t srcSize) 00556 { 00557 return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize); 00558 }; 00559 00564 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00565 const wchar_t* source, size_t srcSize) 00566 { 00567 return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize); 00568 }; 00569 00574 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00575 const wchar_t* source, size_t srcSize) 00576 { 00577 size_t srcChars = srcSize; 00578 if (srcSize == (size_t)-1) 00579 { 00580 srcChars = 0; 00581 const wchar_t* sptr = source; 00582 while (*sptr++ != 0) srcChars++; 00583 } 00584 if ((dest != 0) && (destSize != 0)) 00585 { 00586 size_t len = MIN (destSize - 1, srcChars); 00587 memcpy (dest, source, len * sizeof (wchar_t)); 00588 *(dest + len) = 0; 00589 } 00590 return srcChars + 1; 00591 }; 00592 00597 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00598 const wchar_t* source, size_t srcSize) 00599 { 00600 return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize); 00601 }; 00603 #elif (CS_WCHAR_T_SIZE == 4) 00604 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00605 const utf8_char* source, size_t srcSize) 00606 { 00607 return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize); 00608 }; 00609 00610 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00611 const utf16_char* source, size_t srcSize) 00612 { 00613 return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize); 00614 }; 00615 00616 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00617 const utf32_char* source, size_t srcSize) 00618 { 00619 size_t srcChars = srcSize; 00620 if (srcSize == (size_t)-1) 00621 { 00622 srcChars = 0; 00623 const utf32_char* sptr = source; 00624 while (*sptr++ != 0) srcChars++; 00625 } 00626 if ((dest != 0) && (destSize != 0)) 00627 { 00628 size_t len = MIN (destSize - 1, srcChars); 00629 memcpy (dest, source, len * sizeof (wchar_t)); 00630 *(dest + len) = 0; 00631 } 00632 return srcChars + 1; 00633 }; 00634 00635 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00636 const wchar_t* source, size_t srcSize) 00637 { 00638 return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize); 00639 }; 00640 00641 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00642 const wchar_t* source, size_t srcSize) 00643 { 00644 return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize); 00645 }; 00646 00647 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00648 const wchar_t* source, size_t srcSize) 00649 { 00650 size_t srcChars = srcSize; 00651 if (srcSize == (size_t)-1) 00652 { 00653 srcChars = 0; 00654 const wchar_t* sptr = source; 00655 while (*sptr++ != 0) srcChars++; 00656 } 00657 if ((dest != 0) && (destSize != 0)) 00658 { 00659 size_t len = MIN (destSize - 1, srcChars); 00660 memcpy (dest, source, len * sizeof (wchar_t)); 00661 *(dest + len) = 0; 00662 } 00663 return srcChars + 1; 00664 }; 00665 #else 00666 #error Odd-sized, unsupported wchar_t! 00667 #endif 00668 00681 inline static int UTF8Skip (const utf8_char* str, size_t maxSkip) 00682 { 00683 if (maxSkip < 1) return 0; 00684 00685 if ((*str & 0x80) == 0) 00686 { 00687 return 1; 00688 } 00689 else 00690 { 00691 int n = 0; 00692 while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; } 00693 00694 if ((n < 2) || (n > 6)) 00695 { 00696 return 1; 00697 } 00698 00699 int skip = 1; 00700 00701 for (; skip < n; skip++) 00702 { 00703 if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip)) 00704 { 00705 break; 00706 } 00707 } 00708 return skip; 00709 } 00710 } 00711 00721 inline static int UTF8Rewind (const utf8_char* str, size_t maxRew) 00722 { 00723 if (maxRew < 1) return 0; 00724 00725 const utf8_char* pos = str - 1; 00726 00727 if ((*pos & 0x80) == 0) 00728 { 00729 return 1; 00730 } 00731 00732 // Skip backward to the first byte of the sequence. 00733 int skip = 1; 00734 while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew)) 00735 { 00736 skip++; 00737 pos--; 00738 } 00739 00740 return skip; 00741 } 00742 00748 inline static int UTF16Skip (const utf16_char* str, size_t maxSkip) 00749 { 00750 if (CS_UC_IS_HIGH_SURROGATE (*str)) 00751 return (int)(MIN(maxSkip, 2)); 00752 else 00753 return (int)(MIN(maxSkip, 1)); 00754 } 00755 00761 inline static int UTF16Rewind (const utf16_char* str, size_t maxRew) 00762 { 00763 if (maxRew < 1) return 0; 00764 00765 const utf16_char* pos = str - 1; 00766 if (!CS_UC_IS_SURROGATE(*pos)) 00767 return 1; 00768 else 00769 { 00770 if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1)))) 00771 return 2; 00772 else 00773 return 1; 00774 } 00775 } 00776 00782 inline static int UTF32Skip (const utf32_char* str, size_t maxSkip) 00783 { 00784 return (int)(MIN(maxSkip, 1)); 00785 } 00786 00792 inline static int UTF32Rewind (const utf32_char* str, size_t maxRew) 00793 { 00794 if (maxRew < 1) return 0; 00795 return 1; 00796 } 00798 }; 00799 00802 #endif 00803
Generated for Crystal Space by doxygen 1.2.18