1: 
  2: // Copyright (c) 1999-2009 by Digital Mars
  3: // All Rights Reserved
  4: // written by Walter Bright
  5: // http://www.digitalmars.com
  6: // License for redistribution is by either the Artistic License
  7: // in artistic.txt, or the GNU General Public License in gpl.txt.
  8: // See the included readme.txt for details.
  9: 
 10: 
 11: /* HTML parser
 12:  */
 13: 
 14: #include <stdio.h>
 15: #include <string.h>
 16: #include <ctype.h>
 17: #include <stdarg.h>
 18: #include <errno.h>
 19: #include <wchar.h>
 20: 
 21: #include "html.h"
 22: 
 23: #if MARS
 24: static char __file__[] = __FILE__;      /* for tassert.h                */
 25: #include        "tassert.h"
 26: #include "root.h"
 27: //#include "../mars/mars.h"
 28: #else
 29: #include "outbuf.h"
 30: #include "msgs2.h"
 31: 
 32: extern void html_err(const char *, unsigned, unsigned, ...);
 33: 
 34: static char __file__[] = __FILE__;      /* for tassert.h                */
 35: #include        "tassert.h"
 36: #endif
 37: 
 38: #if __GNUC__
 39: int memicmp(const char *s1, const char *s2, int n);
 40: #if 0
 41: {
 42:     int result = 0;
 43: 
 44:     for (int i = 0; i < n; i++)
 45:     {   char c1 = s1[i];
 46:         char c2 = s2[i];
 47: 
 48:         result = c1 - c2;
 49:         if (result)
 50:         {
 51:             if ('A' <= c1 && c1 <= 'Z')
 52:                 c1 += 'a' - 'A';
 53:             if ('A' <= c2 && c2 <= 'Z')
 54:                 c2 += 'a' - 'A';
 55:             result = c1 - c2;
 56:             if (result)
 57:                 break;
 58:         }
 59:     }
 60:     return result;
 61: }
 62: #endif
 63: #endif
 64: 
 65: extern int HtmlNamedEntity(unsigned char *p, int length);
 66: 
 67: static int isLineSeparator(const unsigned char* p);
 68: 
 69: /**********************************
 70:  * Determine if beginning of tag identifier
 71:  * or a continuation of a tag identifier.
 72:  */
 73: 
 74: inline int istagstart(int c)
 75: {
 76:     return (isalpha(c) || c == '_');
 77: }
 78: 
 79: inline int istag(int c)
 80: {
 81:     return (isalnum(c) || c == '_');
 82: }
 83: 
 84: /**********************************************
 85:  */
 86: 
 87: Html::Html(const char *sourcename, unsigned char *base, unsigned length)
 88: {
 89:     //printf("Html::Html()\n");
 90:     this->sourcename = sourcename;
 91:     this->base = base;
 92:     p = base;
 93:     end = base + length;
 94:     linnum = 1;
 95:     dbuf = NULL;
 96:     inCode = 0;
 97: }
 98: 
 99: /**********************************************
100:  * Print error & quit.
101:  */
102: 
103: void Html::error(const char *format, ...)
104: {
105:     printf("%s(%d) : HTML Error: ", sourcename, linnum);
106: 
107:     va_list ap;
108:     va_start(ap, format);
109:     vprintf(format, ap);
110:     va_end(ap);
111: 
112:     printf("\n");
113:     fflush(stdout);
114: 
115: //#if MARS
116: //    global.errors++;
117: //#else
118:     exit(EXIT_FAILURE);
119: //#endif
120: }
121: 
122: /**********************************************
123:  * Extract all the code from an HTML file,
124:  * concatenate it all together, and store in buf.
125:  */
126: 
127: #if MARS
128: void Html::extractCode(OutBuffer *buf)
129: #else
130: void Html::extractCode(Outbuffer *buf)
131: #endif
132: {
133:     //printf("Html::extractCode()\n");
134:     dbuf = buf;                 // save for other routines
135:     buf->reserve(end - p);
136:     inCode = 0;
137:     while (1)
138:     {
139:         //printf("p = %p, *p = x%x\n", p, *p);
140:         switch (*p)
141:         {
142: #if 0 // strings are not recognized outside of tags
143:             case '"':
144:             case '\'':
145:                 skipString();
146:                 continue;
147: #endif
148:             case '<':
149:                 if (p[1] == '!' && isCommentStart())
150:                 {   // Comments start with <!--
151:                     scanComment();
152:                 }
153:                 else if(p[1] == '!' && isCDATAStart())
154:                 {
155:                     scanCDATA();
156:                 }
157:                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
158:                     skipTag();
159:                 else if (istagstart(*skipWhite(p + 1)))
160:                     skipTag();
161:                 else
162:                     goto Ldefault;
163:                 continue;
164: 
165:             case 0:
166:             case 0x1a:
167:                 break;          // end of file
168: 
169:             case '&':
170:                 if (inCode)
171:                 {   // Translate character entity into ascii for D parser
172:                     int c;
173: 
174:                     c = charEntity();
175: #if MARS
176:                     buf->writeUTF8(c);
177: #else
178:                     buf->writeByte(c);
179: #endif
180:                 }
181:                 else
182:                     p++;
183:                 continue;
184: 
185:             case '\r':
186:                 if (p[1] == '\n')
187:                     goto Ldefault;
188:             case '\n':
189:                 linnum++;
190:                 // Always extract new lines, so that D lexer counts the
191:                 // lines right.
192:                 buf->writeByte(*p);
193:                 p++;
194:                 continue;
195: 
196:             default:
197:             Ldefault:
198:                 if (inCode)
199:                     buf->writeByte(*p);
200:                 p++;
201:                 continue;
202:         }
203:         break;
204:     }
205:     buf->writeByte(0);                          // ending sentinel
206: #if SCPP
207:     //printf("Code is: '%s'\n", buf->toString() + 3);
208: #endif
209: #if MARS
210:     //printf("D code is: '%s'\n", (char *)buf->data);
211: #endif
212: }
213: 
214: /***********************************************
215:  * Scan to end of <> tag.
216:  * Look for <code> and </code> tags to start/stop D processing.
217:  * Input:
218:  *      p is on opening '<' of tag; it's already verified that
219:  *      it's a tag by lookahead
220:  * Output:
221:  *      p is past closing '>' of tag
222:  */
223: 
224: void Html::skipTag()
225: {
226:     enum TagState       // what parsing state we're in
227:     {
228:         TStagstart,     // start of tag name
229:         TStag,          // in a tag name
230:         TSrest,         // following tag name
231:     };
232:     enum TagState state = TStagstart;
233:     int inot;
234:     unsigned char *tagstart = NULL;
235:     int taglen = 0;
236: 
237:     p++;
238:     inot = 0;
239:     if (*p == '/')
240:     {   inot = 1;
241:         p++;
242:     }
243:     while (1)
244:     {
245:         switch (*p)
246:         {
247:             case '>':           // found end of tag
248:                 p++;
249:                 break;
250: 
251:             case '"':
252:             case '\'':
253:                 state = TSrest;
254:                 skipString();
255:                 continue;
256: 
257:             case '<':
258:                 if (p[1] == '!' && isCommentStart())
259:                 {   // Comments start with <!--
260:                     scanComment();
261:                 }
262:                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
263:                 {   error("nested tag");
264:                     skipTag();
265:                 }
266:                 else if (istagstart(*skipWhite(p + 1)))
267:                 {   error("nested tag");
268:                     skipTag();
269:                 }
270:                 // Treat comments as if they were whitespace
271:                 state = TSrest;
272:                 continue;
273: 
274:             case 0:
275:             case 0x1a:
276:                 error("end of file before end of tag");
277:                 break;          // end of file
278: 
279:             case '\r':
280:                 if (p[1] == '\n')
281:                     goto Ldefault;
282:             case '\n':
283:                 linnum++;
284:                 // Always extract new lines, so that code lexer counts the
285:                 // lines right.
286:                 dbuf->writeByte(*p);
287:                 state = TSrest;                 // end of tag
288:                 p++;
289:                 continue;
290: 
291:             case ' ':
292:             case '\t':
293:             case '\f':
294:             case '\v':
295:                 if (state == TStagstart)
296:                 {   p++;
297:                     continue;
298:                 }
299:             default:
300:             Ldefault:
301:                 switch (state)
302:                 {
303:                     case TStagstart:            // start of tag name
304:                         assert(istagstart(*p));
305:                         state = TStag;
306:                         tagstart = p;
307:                         taglen = 0;
308:                         break;
309: 
310:                     case TStag:
311:                         if (istag(*p))
312:                         {   // Continuing tag name
313:                             taglen++;
314:                         }
315:                         else
316:                         {   // End of tag name
317:                             state = TSrest;
318:                         }
319:                         break;
320: 
321:                     case TSrest:
322:                         break;
323:                 }
324:                 p++;
325:                 continue;
326:         }
327:         break;
328:     }
329: 
330:     // See if we parsed a <code> or </code> tag
331:     if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
332:         && *(p - 2) != '/') // ignore "<code />" (XHTML)
warning C4996: 'memicmp': The POSIX name for this item is deprecated. Instead, use the ISO C++ conformant name: _memicmp. See online help for details. c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\string.h(93) : see declaration of 'memicmp'
333: { 334: if (inot) 335: { inCode--; 336: if (inCode < 0) 337: inCode = 0; // ignore extra </code>'s 338: } 339: else 340: inCode++; 341: } 342: } 343: 344: /*********************************************** 345: * Scan to end of attribute string. 346: */ 347: 348: void Html::skipString() 349: { 350: int tc = *p; 351: 352: while (1) 353: { 354: p++; 355: switch (*p) 356: { 357: case '"': 358: case '\'': 359: if (*p == tc) 360: { p++; 361: break; 362: } 363: continue; 364: 365: case '\r': 366: if (p[1] == '\n') 367: goto Ldefault; 368: case '\n': 369: linnum++; 370: // Always extract new lines, so that D lexer counts the 371: // lines right. 372: dbuf->writeByte(*p); 373: continue; 374: 375: case 0: 376: case 0x1a: 377: Leof:
warning C4102: 'Leof' : unreferenced label
378: error("end of file before closing %c of string", tc); 379: break; 380: 381: default: 382: Ldefault: 383: continue; 384: } 385: break; 386: } 387: } 388: 389: /********************************* 390: * If p points to any white space, skip it 391: * and return pointer just past it. 392: */ 393: 394: unsigned char *Html::skipWhite(unsigned char *q) 395: { 396: for (; 1; q++) 397: { 398: switch (*q) 399: { 400: case ' ': 401: case '\t': 402: case '\f': 403: case '\v': 404: case '\r': 405: case '\n': 406: continue; 407: 408: default: 409: break; 410: } 411: break; 412: } 413: return q; 414: } 415: 416: /*************************************************** 417: * Scan to end of comment. 418: * Comments are defined any of a number of ways. 419: * IE 5.0: <!-- followed by > 420: * "HTML The Definitive Guide": <!-- text with at least one space in it --> 421: * Netscape: <!-- --> comments nest 422: * w3c: whitespace can appear between -- and > of comment close 423: */ 424: 425: void Html::scanComment() 426: { 427: // Most of the complexity is dealing with the case that 428: // an arbitrary amount of whitespace can appear between 429: // the -- and the > of a comment close. 430: int scangt = 0; 431: 432: //printf("scanComment()\n"); 433: if (*p == '\n') 434: { linnum++; 435: // Always extract new lines, so that D lexer counts the 436: // lines right. 437: dbuf->writeByte(*p); 438: } 439: while (1) 440: { 441: //scangt = 1; // IE 5.0 compatibility 442: p++; 443: switch (*p) 444: { 445: case '-': 446: if (p[1] == '-') 447: { 448: if (p[2] == '>') // optimize for most common case 449: { 450: p += 3; 451: break; 452: } 453: p++; 454: scangt = 1; 455: } 456: else 457: scangt = 0; 458: continue; 459: 460: case '>': 461: if (scangt) 462: { // found --> 463: p++; 464: break; 465: } 466: continue; 467: 468: case ' ': 469: case '\t': 470: case '\f': 471: case '\v': 472: // skip white space 473: continue; 474: 475: case '\r': 476: if (p[1] == '\n') 477: goto Ldefault; 478: case '\n': 479: linnum++; // remember to count lines 480: // Always extract new lines, so that D lexer counts the 481: // lines right. 482: dbuf->writeByte(*p); 483: continue; 484: 485: case 0: 486: case 0x1a: 487: error("end of file before closing --> of comment"); 488: break; 489: 490: default: 491: Ldefault: 492: scangt = 0; // it's not --> 493: continue; 494: } 495: break; 496: } 497: //printf("*p = '%c'\n", *p); 498: } 499: 500: /******************************************** 501: * Determine if we are at the start of a comment. 502: * Input: 503: * p is on the opening '<' 504: * Returns: 505: * 0 if not start of a comment 506: * 1 if start of a comment, p is adjusted to point past -- 507: */ 508: 509: int Html::isCommentStart() 510: #ifdef __DMC__ 511: __out(result) 512: { 513: if (result == 0) 514: ; 515: else if (result == 1) 516: { 517: assert(p[-2] == '-' && p[-1] == '-'); 518: } 519: else 520: assert(0); 521: } 522: __body 523: #endif /* __DMC__ */ 524: { unsigned char *s; 525: 526: if (p[0] == '<' && p[1] == '!') 527: { 528: for (s = p + 2; 1; s++) 529: { 530: switch (*s) 531: { 532: case ' ': 533: case '\t': 534: case '\r': 535: case '\f': 536: case '\v': 537: // skip white space, even though spec says no 538: // white space is allowed 539: continue; 540: 541: case '-': 542: if (s[1] == '-') 543: { 544: p = s + 2; 545: return 1; 546: } 547: goto No; 548: 549: default: 550: goto No; 551: } 552: } 553: } 554: No: 555: return 0; 556: } 557: 558: int Html::isCDATAStart() 559: { 560: const char * CDATA_START_MARKER = "<![CDATA["; 561: size_t len = strlen(CDATA_START_MARKER); 562: 563: if (strncmp((char*)p, CDATA_START_MARKER, len) == 0) 564: { 565: p += len; 566: return 1; 567: } 568: else 569: { 570: return 0; 571: } 572: } 573: 574: void Html::scanCDATA() 575: { 576: while(*p && *p != 0x1A) 577: { 578: int lineSepLength = isLineSeparator(p); 579: if (lineSepLength>0) 580: { 581: /* Always extract new lines, so that D lexer counts the lines 582: * right. 583: */ 584: linnum++; 585: dbuf->writeByte('\n'); 586: p += lineSepLength; 587: continue; 588: } 589: else if (p[0] == ']' && p[1] == ']' && p[2] == '>') 590: { 591: /* end of CDATA section */ 592: p += 3; 593: return; 594: } 595: else if (inCode) 596: { 597: /* this CDATA section contains D code */ 598: dbuf->writeByte(*p); 599: } 600: 601: p++; 602: } 603: } 604: 605: 606: /******************************************** 607: * Convert an HTML character entity into a character. 608: * Forms are: 609: * &name; named entity 610: * &#ddd; decimal 611: * &#xhhhh; hex 612: * Input: 613: * p is on the & 614: */ 615: 616: int Html::charEntity() 617: { int c = 0; 618: int v; 619: int hex; 620: unsigned char *pstart = p; 621: 622: //printf("Html::charEntity('%c')\n", *p); 623: if (p[1] == '#') 624: { 625: p++; 626: if (p[1] == 'x' || p[1] == 'X') 627: { p++; 628: hex = 1; 629: } 630: else 631: hex = 0; 632: if (p[1] == ';') 633: goto Linvalid; 634: while (1) 635: { 636: p++; 637: switch (*p) 638: { 639: case 0: 640: case 0x1a: 641: error("end of file before end of character entity"); 642: goto Lignore; 643: 644: case '\n': 645: case '\r': 646: case '<': // tag start 647: // Termination is assumed 648: break; 649: 650: case ';': 651: // Termination is explicit 652: p++; 653: break; 654: 655: case '0': case '1': case '2': case '3': case '4': 656: case '5': case '6': case '7': case '8': case '9': 657: v = *p - '0'; 658: goto Lvalue; 659: 660: case 'a': case 'b': case 'c': 661: case 'd': case 'e': case 'f': 662: if (!hex) 663: goto Linvalid; 664: v = (*p - 'a') + 10; 665: goto Lvalue; 666: 667: case 'A': case 'B': case 'C': 668: case 'D': case 'E': case 'F': 669: if (!hex) 670: goto Linvalid; 671: v = (*p - 'A') + 10; 672: goto Lvalue; 673: 674: Lvalue: 675: if (hex) 676: c = (c << 4) + v; 677: else 678: c = (c * 10) + v; 679: if (c > 0x10FFFF) 680: { 681: error("character entity out of range"); 682: goto Lignore; 683: } 684: continue; 685: 686: default: 687: Linvalid: 688: error("invalid numeric character reference"); 689: goto Lignore; 690: } 691: break; 692: } 693: } 694: else 695: { 696: // It's a named entity; gather all characters until ; 697: unsigned char *idstart = p + 1; 698: 699: while (1) 700: { 701: p++; 702: switch (*p) 703: { 704: case 0: 705: case 0x1a: 706: error("end of file before end of character entity"); 707: break; 708: 709: case '\n': 710: case '\r': 711: case '<': // tag start 712: // Termination is assumed 713: c = HtmlNamedEntity(idstart, p - idstart); 714: if (c == -1) 715: goto Lignore; 716: break; 717: 718: case ';': 719: // Termination is explicit 720: c = HtmlNamedEntity(idstart, p - idstart); 721: if (c == -1) 722: goto Lignore; 723: p++; 724: break; 725: 726: default: 727: continue; 728: } 729: break; 730: } 731: } 732: 733: // Kludge to convert non-breaking space to ascii space 734: if (c == 160) 735: c = ' '; 736: 737: return c; 738: 739: Lignore: 740: //printf("Lignore\n"); 741: p = pstart + 1; 742: return '&'; 743: } 744: 745: /** 746: * identify DOS, Linux, Mac, Next and Unicode line endings 747: * 0 if this is no line separator 748: * >0 the length of the separator 749: * Note: input has to be UTF-8 750: */ 751: static int isLineSeparator(const unsigned char* p) 752: { 753: // Linux 754: if( p[0]=='\n') 755: return 1; 756: 757: // Mac & Dos 758: if( p[0]=='\r') 759: return (p[1]=='\n') ? 2 : 1; 760: 761: // Unicode (line || paragraph sep.) 762: if( p[0]==0xE2 && p[1]==0x80 && (p[2]==0xA8 || p[2]==0xA9)) 763: return 3; 764: 765: // Next 766: if( p[0]==0xC2 && p[1]==0x85) 767: return 2; 768: 769: return 0; 770: } 771: 772: 773: