file/src/ascmagic.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) Ian F. Darwin 1986-1995.
00003  * Software written by Ian F. Darwin and others;
00004  * maintained 1995-present by Christos Zoulas and others.
00005  * 
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  * 1. Redistributions of source code must retain the above copyright
00010  *    notice immediately at the beginning of the file, without modification,
00011  *    this list of conditions, and the following disclaimer.
00012  * 2. Redistributions in binary form must reproduce the above copyright
00013  *    notice, this list of conditions and the following disclaimer in the
00014  *    documentation and/or other materials provided with the distribution.
00015  *  
00016  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00017  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00018  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00019  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
00020  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00021  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00022  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00023  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00024  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00025  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00026  * SUCH DAMAGE.
00027  */
00028 /*
00029  * ASCII magic -- file types that we know based on keywords
00030  * that can appear anywhere in the file.
00031  *
00032  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
00033  * to handle character codes other than ASCII on a unified basis.
00034  *
00035  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
00036  * international characters, now subsumed into this file.
00037  */
00038 
00039 #include "file.h"
00040 #include "magic.h"
00041 #include <stdio.h>
00042 #include <string.h>
00043 #include <memory.h>
00044 #include <ctype.h>
00045 #include <stdlib.h>
00046 #ifdef HAVE_UNISTD_H
00047 #include <unistd.h>
00048 #endif
00049 #include "names.h"
00050 
00051 #ifndef lint
00052 FILE_RCSID("@(#)$File: ascmagic.c,v 1.50 2007/03/15 14:51:00 christos Exp $")
00053 #endif  /* lint */
00054 
00055 typedef unsigned long unichar;  /* one decoded character; the looks_*() tests store one per ubuf element */
00056 
00057 #define MAXLINELEN 300  /* longest sane line length */
/*
 * ISSPC: true for blank, tab, CR, LF, form feed and 0x85 (the "next line"
 * control).  The argument is evaluated several times, so it must be free
 * of side effects.
 */
00058 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00059                   || (x) == 0x85 || (x) == '\f')
00060 
/*
 * Character-code tests.  Each one decodes buf[0 ... nbytes-1] into ubuf
 * and stores the number of characters produced in *ulen; a nonzero
 * return means the data looks like text in that character code.
 */
00061 private int looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00062     size_t *ulen)
00063         /*@modifies ubuf, *ulen @*/;
00064 private int looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00065     size_t *ulen)
00066         /*@modifies ubuf, *ulen @*/;
00067 private int looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00068     size_t *ulen)
00069         /*@modifies ubuf, *ulen @*/;
00070 private int looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00071     size_t *ulen)
00072         /*@modifies ubuf, *ulen @*/;
00073 private int looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00074     size_t *ulen)
00075         /*@modifies ubuf, *ulen @*/;
/* Translate nbytes bytes of EBCDIC from buf into out. */
00076 private void from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00077         /*@modifies *out @*/;
/* Compare a NUL-terminated token against ulen characters of decoded text. */
00078 private int ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00079         /*@*/;
00080 
00081 
00082 protected int
00083 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00084 {
00085         size_t i;
00086         unsigned char *nbuf = NULL;
00087         unichar *ubuf = NULL;   
00088         size_t ulen;
00089         struct names *p;
00090         int rv = -1;
00091 
00092         const char *code = NULL;
00093         const char *code_mime = NULL;
00094         const char *type = NULL;
00095         const char *subtype = NULL;
00096         const char *subtype_mime = NULL;
00097 
00098         int has_escapes = 0;
00099         int has_backspace = 0;
00100         int seen_cr = 0;
00101 
00102         int n_crlf = 0;
00103         int n_lf = 0;
00104         int n_cr = 0;
00105         int n_nel = 0;
00106 
00107         size_t last_line_end = (size_t)-1;
00108         int has_long_lines = 0;
00109 
00110         /*
00111          * Undo the NUL-termination kindly provided by process()
00112          * but leave at least one byte to look at
00113          */
00114         while (nbytes > 1 && buf[nbytes - 1] == '\0')
00115                 nbytes--;
00116 
00117         if ((nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0]))) == NULL)
00118                 goto done;
00119         if ((ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0]))) == NULL)
00120                 goto done;
00121 
00122         /*
00123          * Then try to determine whether it's any character code we can
00124          * identify.  Each of these tests, if it succeeds, will leave
00125          * the text converted into one-unichar-per-character Unicode in
00126          * ubuf, and the number of characters converted in ulen.
00127          */
00128         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
00129                 code = "ASCII";
00130                 code_mime = "us-ascii";
00131                 type = "text";
00132         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
00133                 code = "UTF-8 Unicode";
00134                 code_mime = "utf-8";
00135                 type = "text";
00136         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
00137                 if (i == 1)
00138                         code = "Little-endian UTF-16 Unicode";
00139                 else
00140                         code = "Big-endian UTF-16 Unicode";
00141 
00142                 type = "character data";
00143                 code_mime = "utf-16";    /* is this defined? */
00144         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
00145                 code = "ISO-8859";
00146                 type = "text";
00147                 code_mime = "iso-8859-1"; 
00148         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
00149                 code = "Non-ISO extended-ASCII";
00150                 type = "text";
00151                 code_mime = "unknown";
00152         } else {
00153                 from_ebcdic(buf, nbytes, nbuf);
00154 
00155                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
00156                         code = "EBCDIC";
00157                         type = "character data";
00158                         code_mime = "ebcdic";
00159                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
00160                         code = "International EBCDIC";
00161                         type = "character data";
00162                         code_mime = "ebcdic";
00163                 } else {
00164                         rv = 0;
00165                         goto done;  /* doesn't look like text at all */
00166                 }
00167         }
00168 
00169         if (nbytes <= 1) {
00170                 rv = 0;
00171                 goto done;
00172         }
00173 
00174         /*
00175          * for troff, look for . + letter + letter or .\";
00176          * this must be done to disambiguate tar archives' ./file
00177          * and other trash from real troff input.
00178          *
00179          * I believe Plan 9 troff allows non-ASCII characters in the names
00180          * of macros, so this test might possibly fail on such a file.
00181          */
00182         if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') {
00183                 unichar *tp = ubuf + 1;
00184 
00185                 while (ISSPC(*tp))
00186                         ++tp;   /* skip leading whitespace */
00187                 if ((tp[0] == '\\' && tp[1] == '\"') ||
00188                     (isascii((unsigned char)tp[0]) &&
00189                      isalnum((unsigned char)tp[0]) &&
00190                      isascii((unsigned char)tp[1]) &&
00191                      isalnum((unsigned char)tp[1]) &&
00192                      ISSPC(tp[2]))) {
00193                         subtype_mime = "text/troff";
00194                         subtype = "troff or preprocessor input";
00195                         goto subtype_identified;
00196                 }
00197         }
00198 
00199         if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 &&
00200             (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00201                 subtype_mime = "text/fortran";
00202                 subtype = "fortran program";
00203                 goto subtype_identified;
00204         }
00205 
00206         /* look for tokens from names.h - this is expensive! */
00207 
00208         if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
00209                 goto subtype_identified;
00210 
00211         i = 0;
00212         while (i < ulen) {
00213                 size_t end;
00214 
00215                 /*
00216                  * skip past any leading space
00217                  */
00218                 while (i < ulen && ISSPC(ubuf[i]))
00219                         i++;
00220                 if (i >= ulen)
00221                         break;
00222 
00223                 /*
00224                  * find the next whitespace
00225                  */
00226                 for (end = i + 1; end < nbytes; end++)
00227                         if (ISSPC(ubuf[end]))
00228                                 /*@innerbreak@*/ break;
00229 
00230                 /*
00231                  * compare the word thus isolated against the token list
00232                  */
00233                 for (p = names; p < names + NNAMES; p++) {
00234                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
00235                             end - i)) {
00236                                 subtype = types[p->type].human;
00237                                 subtype_mime = types[p->type].mime;
00238                                 goto subtype_identified;
00239                         }
00240                 }
00241 
00242                 i = end;
00243         }
00244 
00245 subtype_identified:
00246 
00247         /*
00248          * Now try to discover other details about the file.
00249          */
00250         for (i = 0; i < ulen; i++) {
00251                 if (ubuf[i] == '\n') {
00252                         if (seen_cr)
00253                                 n_crlf++;
00254                         else
00255                                 n_lf++;
00256                         last_line_end = i;
00257                 } else if (seen_cr)
00258                         n_cr++;
00259 
00260                 seen_cr = (ubuf[i] == '\r');
00261                 if (seen_cr)
00262                         last_line_end = i;
00263 
00264                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
00265                         n_nel++;
00266                         last_line_end = i;
00267                 }
00268 
00269                 /* If this line is _longer_ than MAXLINELEN, remember it. */
00270                 if (i > last_line_end + MAXLINELEN)
00271                         has_long_lines = 1;
00272 
00273                 if (ubuf[i] == '\033')
00274                         has_escapes = 1;
00275                 if (ubuf[i] == '\b')
00276                         has_backspace = 1;
00277         }
00278 
00279         /* Beware, if the data has been truncated, the final CR could have
00280            been followed by a LF.  If we have HOWMANY bytes, it indicates
00281            that the data might have been truncated, probably even before
00282            this function was called. */
00283         if (seen_cr && nbytes < HOWMANY)
00284                 n_cr++;
00285 
00286         if ((ms->flags & MAGIC_MIME)) {
00287                 if (subtype_mime) {
00288                         if (file_printf(ms, subtype_mime) == -1)
00289                                 goto done;
00290                 } else {
00291                         if (file_printf(ms, "text/plain") == -1)
00292                                 goto done;
00293                 }
00294 
00295                 if (code_mime) {
00296                         if (file_printf(ms, "; charset=") == -1)
00297                                 goto done;
00298                         if (file_printf(ms, code_mime) == -1)
00299                                 goto done;
00300                 }
00301         } else {
00302                 if (file_printf(ms, code) == -1)
00303                         goto done;
00304 
00305                 if (subtype) {
00306                         if (file_printf(ms, " ") == -1)
00307                                 goto done;
00308                         if (file_printf(ms, subtype) == -1)
00309                                 goto done;
00310                 }
00311 
00312                 if (file_printf(ms, " ") == -1)
00313                         goto done;
00314                 if (file_printf(ms, type) == -1)
00315                         goto done;
00316 
00317                 if (has_long_lines)
00318                         if (file_printf(ms, ", with very long lines") == -1)
00319                                 goto done;
00320 
00321                 /*
00322                  * Only report line terminators if we find one other than LF,
00323                  * or if we find none at all.
00324                  */
00325                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00326                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00327                         if (file_printf(ms, ", with") == -1)
00328                                 goto done;
00329 
00330                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
00331                                 if (file_printf(ms, " no") == -1)
00332                                         goto done;
00333                         } else {
00334                                 if (n_crlf) {
00335                                         if (file_printf(ms, " CRLF") == -1)
00336                                                 goto done;
00337                                         if (n_cr || n_lf || n_nel)
00338                                                 if (file_printf(ms, ",") == -1)
00339                                                         goto done;
00340                                 }
00341                                 if (n_cr) {
00342                                         if (file_printf(ms, " CR") == -1)
00343                                                 goto done;
00344                                         if (n_lf || n_nel)
00345                                                 if (file_printf(ms, ",") == -1)
00346                                                         goto done;
00347                                 }
00348                                 if (n_lf) {
00349                                         if (file_printf(ms, " LF") == -1)
00350                                                 goto done;
00351                                         if (n_nel)
00352                                                 if (file_printf(ms, ",") == -1)
00353                                                         goto done;
00354                                 }
00355                                 if (n_nel)
00356                                         if (file_printf(ms, " NEL") == -1)
00357                                                 goto done;
00358                         }
00359 
00360                         if (file_printf(ms, " line terminators") == -1)
00361                                 goto done;
00362                 }
00363 
00364                 if (has_escapes)
00365                         if (file_printf(ms, ", with escape sequences") == -1)
00366                                 goto done;
00367                 if (has_backspace)
00368                         if (file_printf(ms, ", with overstriking") == -1)
00369                                 goto done;
00370         }
00371         rv = 1;
00372 done:
00373         if (nbuf)
00374                 free(nbuf);
00375         if (ubuf)
00376                 free(ubuf);
00377 
00378         return rv;
00379 }
00380 
00381 private int
00382 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00383 {
00384         size_t i;
00385 
00386         for (i = 0; i < ulen; i++) {
00387                 if (s[i] != us[i])
00388                         return 0;
00389         }
00390 
00391         if (s[i])
00392                 return 0;
00393         else
00394                 return 1;
00395 }
00396 
00397 /*
00398  * This table reflects a particular philosophy about what constitutes
00399  * "text," and there is room for disagreement about it.
00400  *
00401  * Version 3.31 of the file command considered a file to be ASCII if
00402  * each of its characters was approved by either the isascii() or
00403  * isalpha() function.  On most systems, this would mean that any
00404  * file consisting only of characters in the range 0x00 ... 0x7F
00405  * would be called ASCII text, but many systems might reasonably
00406  * consider some characters outside this range to be alphabetic,
00407  * so the file command would call such characters ASCII.  It might
00408  * have been more accurate to call this "considered textual on the
00409  * local system" than "ASCII."
00410  *
00411  * It considered a file to be "International language text" if each
00412  * of its characters was either an ASCII printing character (according
00413  * to the real ASCII standard, not the above test), a character in
00414  * the range 0x80 ... 0xFF, or one of the following control characters:
00415  * backspace, tab, line feed, vertical tab, form feed, carriage return,
00416  * escape.  No attempt was made to determine the language in which files
00417  * of this type were written.
00418  *
00419  *
00420  * The table below considers a file to be ASCII if all of its characters
00421  * are either ASCII printing characters (again, according to the X3.4
00422  * standard, not isascii()) or any of the following controls: bell,
00423  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
00424  *
00425  * I include bell because some programs (particularly shell scripts)
00426  * use it literally, even though it is rare in normal text.  I exclude
00427  * vertical tab because it never seems to be used in real text.  I also
00428  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
00429  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
00430  * character to.  It might be more appropriate to include it in the 8859
00431  * set instead of the ASCII set, but it's got to be included in *something*
00432  * we recognize or EBCDIC files aren't going to be considered textual.
00433  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
00434  * and Latin characters, so these should possibly be allowed.  But they
00435  * make a real mess on VT100-style displays if they're not paired properly,
00436  * so we are probably better off not calling them text.
00437  *
00438  * A file is considered to be ISO-8859 text if its characters are all
00439  * either ASCII, according to the above definition, or printing characters
00440  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
00441  *
00442  * Finally, a file is considered to be international text from some other
00443  * character code if its characters are all either ISO-8859 (according to
00444  * the above definition) or characters in the range 0x80 ... 0x9F, which
00445  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
00446  * consider to be printing characters.
00447  */
00448 
00449 #define F 0   /* character never appears in text */
00450 #define T 1   /* character appears in plain ASCII text */
00451 #define I 2   /* character appears in ISO-8859 text */
00452 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
00453 
00454 /*@unchecked@*/ /*@observer@*/
00455 private char text_chars[256] = {
00456         /*                  BEL BS HT LF    FF CR    */
00457         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
00458         /*                              ESC          */
00459         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
00460         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
00461         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
00462         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
00463         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
00464         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
00465         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
00466         /*            NEL                            */
00467         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
00468         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
00469         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
00470         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
00471         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
00472         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
00473         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
00474         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
00475 };
00476 
00477 private int
00478 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00479     size_t *ulen)
00480 {
00481         size_t i;
00482 
00483         *ulen = 0;
00484 
00485         for (i = 0; i < nbytes; i++) {
00486                 int t = text_chars[buf[i]];
00487 
00488                 if (t != T)
00489                         return 0;
00490 
00491                 ubuf[(*ulen)++] = buf[i];
00492         }
00493 
00494         return 1;
00495 }
00496 
00497 private int
00498 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00499 {
00500         size_t i;
00501 
00502         *ulen = 0;
00503 
00504         for (i = 0; i < nbytes; i++) {
00505                 int t = text_chars[buf[i]];
00506 
00507                 if (t != T && t != I)
00508                         return 0;
00509 
00510                 ubuf[(*ulen)++] = buf[i];
00511         }
00512 
00513         return 1;
00514 }
00515 
00516 private int
00517 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00518     size_t *ulen)
00519 {
00520         size_t i;
00521 
00522         *ulen = 0;
00523 
00524         for (i = 0; i < nbytes; i++) {
00525                 int t = text_chars[buf[i]];
00526 
00527                 if (t != T && t != I && t != X)
00528                         return 0;
00529 
00530                 ubuf[(*ulen)++] = buf[i];
00531         }
00532 
00533         return 1;
00534 }
00535 
00536 private int
00537 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00538 {
00539         size_t i;
00540         int n;
00541         unichar c;
00542         int gotone = 0;
00543 
00544         *ulen = 0;
00545 
00546         for (i = 0; i < nbytes; i++) {
00547                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
00548                         /*
00549                          * Even if the whole file is valid UTF-8 sequences,
00550                          * still reject it if it uses weird control characters.
00551                          */
00552 
00553                         if (text_chars[buf[i]] != T)
00554                                 return 0;
00555 
00556                         ubuf[(*ulen)++] = buf[i];
00557                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
00558                         return 0;
00559                 } else {                           /* 11xxxxxx begins UTF-8 */
00560                         int following;
00561 
00562                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
00563                                 c = buf[i] & 0x1f;
00564                                 following = 1;
00565                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
00566                                 c = buf[i] & 0x0f;
00567                                 following = 2;
00568                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
00569                                 c = buf[i] & 0x07;
00570                                 following = 3;
00571                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
00572                                 c = buf[i] & 0x03;
00573                                 following = 4;
00574                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
00575                                 c = buf[i] & 0x01;
00576                                 following = 5;
00577                         } else
00578                                 return 0;
00579 
00580                         for (n = 0; n < following; n++) {
00581                                 i++;
00582                                 if (i >= nbytes)
00583                                         goto done;
00584 
00585                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00586                                         return 0;
00587 
00588                                 c = (c << 6) + (buf[i] & 0x3f);
00589                         }
00590 
00591                         ubuf[(*ulen)++] = c;
00592                         gotone = 1;
00593                 }
00594         }
00595 done:
00596         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
00597 }
00598 
00599 private int
00600 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00601     size_t *ulen)
00602 {
00603         int bigend;
00604         size_t i;
00605 
00606         if (nbytes < 2)
00607                 return 0;
00608 
00609         if (buf[0] == 0xff && buf[1] == 0xfe)
00610                 bigend = 0;
00611         else if (buf[0] == 0xfe && buf[1] == 0xff)
00612                 bigend = 1;
00613         else
00614                 return 0;
00615 
00616         *ulen = 0;
00617 
00618         for (i = 2; i + 1 < nbytes; i += 2) {
00619                 /* XXX fix to properly handle chars > 65536 */
00620 
00621                 if (bigend)
00622                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00623                 else
00624                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00625 
00626                 if (ubuf[*ulen - 1] == 0xfffe)
00627                         return 0;
00628                 if (ubuf[*ulen - 1] < 128 &&
00629                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
00630                         return 0;
00631         }
00632 
00633         return 1 + bigend;
00634 }
00635 
/*
 * F, T, I and X are only needed by text_chars[] and the looks_*() tests
 * above, so the short names are released here.
 */
00636 #undef F
00637 #undef T
00638 #undef I
00639 #undef X
00640 
00641 /*
00642  * This table maps each EBCDIC character to an (8-bit extended) ASCII
00643  * character, as specified in the rationale for the dd(1) command in
00644  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
00645  *
00646  * Unfortunately it does not seem to correspond exactly to any of the
00647  * five variants of EBCDIC documented in IBM's _Enterprise Systems
00648  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
00649  * Edition, July, 1999, pp. I-1 - I-4.
00650  *
00651  * Fortunately, though, all versions of EBCDIC, including this one, agree
00652  * on most of the printing characters that also appear in (7-bit) ASCII.
00653  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
00654  *
00655  * Fortunately too, there is general agreement that codes 0x00 through
00656  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
00657  * remainder printing characters.
00658  *
00659  * This is sufficient to allow us to identify EBCDIC text and to distinguish
00660  * between old-style and internationalized examples of text.
00661  */
00662 
00663 /*@unchecked@*/ /*@observer@*/
00664 private unsigned char ebcdic_to_ascii[] = {
00665   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
00666  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
00667 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
00668 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
00669 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
00670 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
00671 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
00672 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
00673 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
00674 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
00675 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
00676 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
00677 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
00678 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
00679 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
00680 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
00681 };
00682 
00683 #ifdef notdef  /* NOTE(review): notdef is presumably never defined, so this table is compiled out - confirm */
00684 /*
00685  * The following EBCDIC-to-ASCII table may relate more closely to reality,
00686  * or at least to modern reality.  It comes from
00687  *
00688  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
00689  *
00690  * and maps the characters of EBCDIC code page 1047 (the code used for
00691  * Unix-derived software on IBM's 390 systems) to the corresponding
00692  * characters from ISO 8859-1.
00693  *
00694  * If this table is used instead of the above one, some of the special
00695  * cases for the NEL character can be taken out of the code.
00696  */
00697 
/* Alternative translation table; nothing in this file refers to it. */
00698 private unsigned char ebcdic_1047_to_8859[] = {
00699 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
00700 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
00701 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
00702 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
00703 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
00704 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
00705 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
00706 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
00707 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
00708 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
00709 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
00710 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
00711 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
00712 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
00713 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
00714 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
00715 };
00716 #endif  /* notdef */
00717 
00718 /*
00719  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
00720  */
00721 private void
00722 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00723 {
00724         size_t i;
00725 
00726         for (i = 0; i < nbytes; i++) {
00727                 out[i] = ebcdic_to_ascii[buf[i]];
00728         }
00729 }

Generated on Fri Sep 7 01:27:53 2007 for rpm by  doxygen 1.5.1