00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "file.h"
00040 #include "magic.h"
00041 #include <stdio.h>
00042 #include <string.h>
00043 #include <memory.h>
00044 #include <ctype.h>
00045 #include <stdlib.h>
00046 #ifdef HAVE_UNISTD_H
00047 #include <unistd.h>
00048 #endif
00049 #include "names.h"
00050
00051 #ifndef lint
00052 FILE_RCSID("@(#)$File: ascmagic.c,v 1.50 2007/03/15 14:51:00 christos Exp $")
00053 #endif
00054
00055 typedef unsigned long unichar;
00056
00057 #define MAXLINELEN 300
00058 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00059 || (x) == 0x85 || (x) == '\f')
00060
00061 private int looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00062 size_t *ulen)
00063 ;
00064 private int looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00065 size_t *ulen)
00066 ;
00067 private int looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00068 size_t *ulen)
00069 ;
00070 private int looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00071 size_t *ulen)
00072 ;
00073 private int looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00074 size_t *ulen)
00075 ;
00076 private void from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00077 ;
00078 private int ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00079 ;
00080
00081
00082 protected int
00083 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00084 {
00085 size_t i;
00086 unsigned char *nbuf = NULL;
00087 unichar *ubuf = NULL;
00088 size_t ulen;
00089 struct names *p;
00090 int rv = -1;
00091
00092 const char *code = NULL;
00093 const char *code_mime = NULL;
00094 const char *type = NULL;
00095 const char *subtype = NULL;
00096 const char *subtype_mime = NULL;
00097
00098 int has_escapes = 0;
00099 int has_backspace = 0;
00100 int seen_cr = 0;
00101
00102 int n_crlf = 0;
00103 int n_lf = 0;
00104 int n_cr = 0;
00105 int n_nel = 0;
00106
00107 size_t last_line_end = (size_t)-1;
00108 int has_long_lines = 0;
00109
00110
00111
00112
00113
00114 while (nbytes > 1 && buf[nbytes - 1] == '\0')
00115 nbytes--;
00116
00117 if ((nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0]))) == NULL)
00118 goto done;
00119 if ((ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0]))) == NULL)
00120 goto done;
00121
00122
00123
00124
00125
00126
00127
00128 if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
00129 code = "ASCII";
00130 code_mime = "us-ascii";
00131 type = "text";
00132 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
00133 code = "UTF-8 Unicode";
00134 code_mime = "utf-8";
00135 type = "text";
00136 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
00137 if (i == 1)
00138 code = "Little-endian UTF-16 Unicode";
00139 else
00140 code = "Big-endian UTF-16 Unicode";
00141
00142 type = "character data";
00143 code_mime = "utf-16";
00144 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
00145 code = "ISO-8859";
00146 type = "text";
00147 code_mime = "iso-8859-1";
00148 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
00149 code = "Non-ISO extended-ASCII";
00150 type = "text";
00151 code_mime = "unknown";
00152 } else {
00153 from_ebcdic(buf, nbytes, nbuf);
00154
00155 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
00156 code = "EBCDIC";
00157 type = "character data";
00158 code_mime = "ebcdic";
00159 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
00160 code = "International EBCDIC";
00161 type = "character data";
00162 code_mime = "ebcdic";
00163 } else {
00164 rv = 0;
00165 goto done;
00166 }
00167 }
00168
00169 if (nbytes <= 1) {
00170 rv = 0;
00171 goto done;
00172 }
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182 if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') {
00183 unichar *tp = ubuf + 1;
00184
00185 while (ISSPC(*tp))
00186 ++tp;
00187 if ((tp[0] == '\\' && tp[1] == '\"') ||
00188 (isascii((unsigned char)tp[0]) &&
00189 isalnum((unsigned char)tp[0]) &&
00190 isascii((unsigned char)tp[1]) &&
00191 isalnum((unsigned char)tp[1]) &&
00192 ISSPC(tp[2]))) {
00193 subtype_mime = "text/troff";
00194 subtype = "troff or preprocessor input";
00195 goto subtype_identified;
00196 }
00197 }
00198
00199 if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 &&
00200 (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00201 subtype_mime = "text/fortran";
00202 subtype = "fortran program";
00203 goto subtype_identified;
00204 }
00205
00206
00207
00208 if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
00209 goto subtype_identified;
00210
00211 i = 0;
00212 while (i < ulen) {
00213 size_t end;
00214
00215
00216
00217
00218 while (i < ulen && ISSPC(ubuf[i]))
00219 i++;
00220 if (i >= ulen)
00221 break;
00222
00223
00224
00225
00226 for (end = i + 1; end < nbytes; end++)
00227 if (ISSPC(ubuf[end]))
00228 break;
00229
00230
00231
00232
00233 for (p = names; p < names + NNAMES; p++) {
00234 if (ascmatch((const unsigned char *)p->name, ubuf + i,
00235 end - i)) {
00236 subtype = types[p->type].human;
00237 subtype_mime = types[p->type].mime;
00238 goto subtype_identified;
00239 }
00240 }
00241
00242 i = end;
00243 }
00244
00245 subtype_identified:
00246
00247
00248
00249
00250 for (i = 0; i < ulen; i++) {
00251 if (ubuf[i] == '\n') {
00252 if (seen_cr)
00253 n_crlf++;
00254 else
00255 n_lf++;
00256 last_line_end = i;
00257 } else if (seen_cr)
00258 n_cr++;
00259
00260 seen_cr = (ubuf[i] == '\r');
00261 if (seen_cr)
00262 last_line_end = i;
00263
00264 if (ubuf[i] == 0x85) {
00265 n_nel++;
00266 last_line_end = i;
00267 }
00268
00269
00270 if (i > last_line_end + MAXLINELEN)
00271 has_long_lines = 1;
00272
00273 if (ubuf[i] == '\033')
00274 has_escapes = 1;
00275 if (ubuf[i] == '\b')
00276 has_backspace = 1;
00277 }
00278
00279
00280
00281
00282
00283 if (seen_cr && nbytes < HOWMANY)
00284 n_cr++;
00285
00286 if ((ms->flags & MAGIC_MIME)) {
00287 if (subtype_mime) {
00288 if (file_printf(ms, subtype_mime) == -1)
00289 goto done;
00290 } else {
00291 if (file_printf(ms, "text/plain") == -1)
00292 goto done;
00293 }
00294
00295 if (code_mime) {
00296 if (file_printf(ms, "; charset=") == -1)
00297 goto done;
00298 if (file_printf(ms, code_mime) == -1)
00299 goto done;
00300 }
00301 } else {
00302 if (file_printf(ms, code) == -1)
00303 goto done;
00304
00305 if (subtype) {
00306 if (file_printf(ms, " ") == -1)
00307 goto done;
00308 if (file_printf(ms, subtype) == -1)
00309 goto done;
00310 }
00311
00312 if (file_printf(ms, " ") == -1)
00313 goto done;
00314 if (file_printf(ms, type) == -1)
00315 goto done;
00316
00317 if (has_long_lines)
00318 if (file_printf(ms, ", with very long lines") == -1)
00319 goto done;
00320
00321
00322
00323
00324
00325 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00326 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00327 if (file_printf(ms, ", with") == -1)
00328 goto done;
00329
00330 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
00331 if (file_printf(ms, " no") == -1)
00332 goto done;
00333 } else {
00334 if (n_crlf) {
00335 if (file_printf(ms, " CRLF") == -1)
00336 goto done;
00337 if (n_cr || n_lf || n_nel)
00338 if (file_printf(ms, ",") == -1)
00339 goto done;
00340 }
00341 if (n_cr) {
00342 if (file_printf(ms, " CR") == -1)
00343 goto done;
00344 if (n_lf || n_nel)
00345 if (file_printf(ms, ",") == -1)
00346 goto done;
00347 }
00348 if (n_lf) {
00349 if (file_printf(ms, " LF") == -1)
00350 goto done;
00351 if (n_nel)
00352 if (file_printf(ms, ",") == -1)
00353 goto done;
00354 }
00355 if (n_nel)
00356 if (file_printf(ms, " NEL") == -1)
00357 goto done;
00358 }
00359
00360 if (file_printf(ms, " line terminators") == -1)
00361 goto done;
00362 }
00363
00364 if (has_escapes)
00365 if (file_printf(ms, ", with escape sequences") == -1)
00366 goto done;
00367 if (has_backspace)
00368 if (file_printf(ms, ", with overstriking") == -1)
00369 goto done;
00370 }
00371 rv = 1;
00372 done:
00373 if (nbuf)
00374 free(nbuf);
00375 if (ubuf)
00376 free(ubuf);
00377
00378 return rv;
00379 }
00380
00381 private int
00382 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00383 {
00384 size_t i;
00385
00386 for (i = 0; i < ulen; i++) {
00387 if (s[i] != us[i])
00388 return 0;
00389 }
00390
00391 if (s[i])
00392 return 0;
00393 else
00394 return 1;
00395 }
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449 #define F 0
00450 #define T 1
00451 #define I 2
00452 #define X 3
00453
00454
00455 private char text_chars[256] = {
00456
00457 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,
00458
00459 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,
00460 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00461 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00462 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00463 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00464 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
00465 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,
00466
00467 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,
00468 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
00469 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00470 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00471 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00472 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00473 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
00474 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I
00475 };
00476
00477 private int
00478 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00479 size_t *ulen)
00480 {
00481 size_t i;
00482
00483 *ulen = 0;
00484
00485 for (i = 0; i < nbytes; i++) {
00486 int t = text_chars[buf[i]];
00487
00488 if (t != T)
00489 return 0;
00490
00491 ubuf[(*ulen)++] = buf[i];
00492 }
00493
00494 return 1;
00495 }
00496
00497 private int
00498 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00499 {
00500 size_t i;
00501
00502 *ulen = 0;
00503
00504 for (i = 0; i < nbytes; i++) {
00505 int t = text_chars[buf[i]];
00506
00507 if (t != T && t != I)
00508 return 0;
00509
00510 ubuf[(*ulen)++] = buf[i];
00511 }
00512
00513 return 1;
00514 }
00515
00516 private int
00517 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00518 size_t *ulen)
00519 {
00520 size_t i;
00521
00522 *ulen = 0;
00523
00524 for (i = 0; i < nbytes; i++) {
00525 int t = text_chars[buf[i]];
00526
00527 if (t != T && t != I && t != X)
00528 return 0;
00529
00530 ubuf[(*ulen)++] = buf[i];
00531 }
00532
00533 return 1;
00534 }
00535
00536 private int
00537 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00538 {
00539 size_t i;
00540 int n;
00541 unichar c;
00542 int gotone = 0;
00543
00544 *ulen = 0;
00545
00546 for (i = 0; i < nbytes; i++) {
00547 if ((buf[i] & 0x80) == 0) {
00548
00549
00550
00551
00552
00553 if (text_chars[buf[i]] != T)
00554 return 0;
00555
00556 ubuf[(*ulen)++] = buf[i];
00557 } else if ((buf[i] & 0x40) == 0) {
00558 return 0;
00559 } else {
00560 int following;
00561
00562 if ((buf[i] & 0x20) == 0) {
00563 c = buf[i] & 0x1f;
00564 following = 1;
00565 } else if ((buf[i] & 0x10) == 0) {
00566 c = buf[i] & 0x0f;
00567 following = 2;
00568 } else if ((buf[i] & 0x08) == 0) {
00569 c = buf[i] & 0x07;
00570 following = 3;
00571 } else if ((buf[i] & 0x04) == 0) {
00572 c = buf[i] & 0x03;
00573 following = 4;
00574 } else if ((buf[i] & 0x02) == 0) {
00575 c = buf[i] & 0x01;
00576 following = 5;
00577 } else
00578 return 0;
00579
00580 for (n = 0; n < following; n++) {
00581 i++;
00582 if (i >= nbytes)
00583 goto done;
00584
00585 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00586 return 0;
00587
00588 c = (c << 6) + (buf[i] & 0x3f);
00589 }
00590
00591 ubuf[(*ulen)++] = c;
00592 gotone = 1;
00593 }
00594 }
00595 done:
00596 return gotone;
00597 }
00598
00599 private int
00600 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00601 size_t *ulen)
00602 {
00603 int bigend;
00604 size_t i;
00605
00606 if (nbytes < 2)
00607 return 0;
00608
00609 if (buf[0] == 0xff && buf[1] == 0xfe)
00610 bigend = 0;
00611 else if (buf[0] == 0xfe && buf[1] == 0xff)
00612 bigend = 1;
00613 else
00614 return 0;
00615
00616 *ulen = 0;
00617
00618 for (i = 2; i + 1 < nbytes; i += 2) {
00619
00620
00621 if (bigend)
00622 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00623 else
00624 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00625
00626 if (ubuf[*ulen - 1] == 0xfffe)
00627 return 0;
00628 if (ubuf[*ulen - 1] < 128 &&
00629 text_chars[(size_t)ubuf[*ulen - 1]] != T)
00630 return 0;
00631 }
00632
00633 return 1 + bigend;
00634 }
00635
00636 #undef F
00637 #undef T
00638 #undef I
00639 #undef X
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664 private unsigned char ebcdic_to_ascii[] = {
00665 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
00666 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
00667 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
00668 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
00669 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
00670 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
00671 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
00672 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
00673 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
00674 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
00675 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
00676 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
00677 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
00678 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
00679 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
00680 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
00681 };
00682
#ifdef notdef
/*
 * Alternative 256-entry translation table.  It is compiled only when
 * "notdef" is defined and nothing in this file refers to it.
 * NOTE(review): judging by the name it maps EBCDIC code page 1047 to
 * ISO-8859 -- confirm before enabling.
 */
private unsigned char ebcdic_1047_to_8859[] = {
	0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
	0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
	0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
	0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
	0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
	0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
	0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
	0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
	0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
	0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
	0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
	0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
	0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F
};
#endif
00717
00718
00719
00720
00721 private void
00722 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00723 {
00724 size_t i;
00725
00726 for (i = 0; i < nbytes; i++) {
00727 out[i] = ebcdic_to_ascii[buf[i]];
00728 }
00729 }