1: 
  2: // Copyright (c) 1999-2009 by Digital Mars
  3: // All Rights Reserved
  4: // written by Walter Bright
  5: // http://www.digitalmars.com
  6: // License for redistribution is by either the Artistic License
  7: // in artistic.txt, or the GNU General Public License in gpl.txt.
  8: // See the included readme.txt for details.
  9: 
 10: 
 11: /* HTML parser
 12:  */
 13: 
 14: #include <stdio.h>
 15: #include <string.h>
 16: #include <ctype.h>
 17: #include <stdarg.h>
 18: #include <errno.h>
 19: #include <wchar.h>
 20: 
 21: #include "html.h"
 22: 
 23: #if MARS
 24: static char __file__[] = __FILE__;      /* for tassert.h                */
 25: #include        "tassert.h"
 26: #include "root.h"
 27: //#include "../mars/mars.h"
 28: #else
 29: #include "outbuf.h"
 30: #include "msgs2.h"
 31: 
 32: extern void html_err(const char *, unsigned, unsigned, ...);
 33: 
 34: static char __file__[] = __FILE__;      /* for tassert.h                */
 35: #include        "tassert.h"
 36: #endif
 37: 
 38: #if __GNUC__
 39: int memicmp(const char *s1, const char *s2, int n);
 40: #if 0
 41: {
 42:     int result = 0;
 43: 
 44:     for (int i = 0; i < n; i++)
 45:     {   char c1 = s1[i];
 46:         char c2 = s2[i];
 47: 
 48:         result = c1 - c2;
 49:         if (result)
 50:         {
 51:             if ('A' <= c1 && c1 <= 'Z')
 52:                 c1 += 'a' - 'A';
 53:             if ('A' <= c2 && c2 <= 'Z')
 54:                 c2 += 'a' - 'A';
 55:             result = c1 - c2;
 56:             if (result)
 57:                 break;
 58:         }
 59:     }
 60:     return result;
 61: }
 62: #endif
 63: #endif
 64: 
 65: extern int HtmlNamedEntity(unsigned char *p, int length);
 66: 
 67: static int isLineSeparator(const unsigned char* p);
 68: 
 69: /**********************************
 70:  * Determine if beginning of tag identifier
 71:  * or a continuation of a tag identifier.
 72:  */
 73: 
 74: inline int istagstart(int c)
 75: {
 76:     return (isalpha(c) || c == '_');
 77: }
 78: 
 79: inline int istag(int c)
 80: {
 81:     return (isalnum(c) || c == '_');
 82: }
 83: 
 84: /**********************************************
 85:  */
 86: 
 87: Html::Html(const char *sourcename, unsigned char *base, unsigned length)
 88: {
 89:     //printf("Html::Html()\n");
 90:     this->sourcename = sourcename;
 91:     this->base = base;
 92:     p = base;
 93:     end = base + length;
 94:     linnum = 1;
 95:     dbuf = NULL;
 96:     inCode = 0;
 97: }
 98: 
 99: /**********************************************
100:  * Print error & quit.
101:  */
102: 
103: void Html::error(const char *format, ...)
104: {
105:     printf("%s(%d) : HTML Error: ", sourcename, linnum);
106: 
107:     va_list ap;
108:     va_start(ap, format);
109:     vprintf(format, ap);
110:     va_end(ap);
111: 
112:     printf("\n");
113:     fflush(stdout);
114: 
115: //#if MARS
116: //    global.errors++;
117: //#else
118:     exit(EXIT_FAILURE);
119: //#endif
120: }
121: 
122: /**********************************************
123:  * Extract all the code from an HTML file,
124:  * concatenate it all together, and store in buf.
125:  */
126: 
127: #if MARS
128: void Html::extractCode(OutBuffer *buf)
129: #else
130: void Html::extractCode(Outbuffer *buf)
131: #endif
132: {
133:     //printf("Html::extractCode()\n");
134:     dbuf = buf;                 // save for other routines
135:     buf->reserve(end - p);
136:     inCode = 0;
137:     while (1)
138:     {
139:         //printf("p = %p, *p = x%x\n", p, *p);
140:         switch (*p)
141:         {
142: #if 0 // strings are not recognized outside of tags
143:             case '"':
144:             case '\'':
145:                 skipString();
146:                 continue;
147: #endif
148:             case '<':
149:                 if (p[1] == '!' && isCommentStart())
150:                 {   // Comments start with <!--
151:                     scanComment();
152:                 }
153:                 else if(p[1] == '!' && isCDATAStart())
154:                 {
155:                     scanCDATA();
156:                 }
157:                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
158:                     skipTag();
159:                 else if (istagstart(*skipWhite(p + 1)))
160:                     skipTag();
161:                 else
162:                     goto Ldefault;
163:                 continue;
164: 
165:             case 0:
166:             case 0x1a:
167:                 break;          // end of file
168: 
169:             case '&':
170:                 if (inCode)
171:                 {   // Translate character entity into ascii for D parser
172:                     int c;
173: 
174:                     c = charEntity();
175: #if MARS
176:                     buf->writeUTF8(c);
177: #else
178:                     buf->writeByte(c);
179: #endif
180:                 }
181:                 else
182:                     p++;
183:                 continue;
184: 
185:             case '\r':
186:                 if (p[1] == '\n')
187:                     goto Ldefault;
188:             case '\n':
189:                 linnum++;
190:                 // Always extract new lines, so that D lexer counts the
191:                 // lines right.
192:                 buf->writeByte(*p);
193:                 p++;
194:                 continue;
195: 
196:             default:
197:             Ldefault:
198:                 if (inCode)
199:                     buf->writeByte(*p);
200:                 p++;
201:                 continue;
202:         }
203:         break;
204:     }
205:     buf->writeByte(0);                          // ending sentinel
206: #if SCPP
207:     //printf("Code is: '%s'\n", buf->toString() + 3);
208: #endif
209: #if MARS
210:     //printf("D code is: '%s'\n", (char *)buf->data);
211: #endif
212: }
213: 
214: /***********************************************
215:  * Scan to end of <> tag.
216:  * Look for <code> and </code> tags to start/stop D processing.
217:  * Input:
218:  *      p is on opening '<' of tag; it's already verified that
219:  *      it's a tag by lookahead
220:  * Output:
221:  *      p is past closing '>' of tag
222:  */
223: 
224: void Html::skipTag()
225: {
226:     enum TagState       // what parsing state we're in
227:     {
228:         TStagstart,     // start of tag name
229:         TStag,          // in a tag name
230:         TSrest,         // following tag name
231:     };
232:     enum TagState state = TStagstart;
233:     int inot;
234:     unsigned char *tagstart = NULL;
235:     int taglen = 0;
236: 
237:     p++;
238:     inot = 0;
239:     if (*p == '/')
240:     {   inot = 1;
241:         p++;
242:     }
243:     while (1)
244:     {
245:         switch (*p)
246:         {
247:             case '>':           // found end of tag
248:                 p++;
249:                 break;
250: 
251:             case '"':
252:             case '\'':
253:                 state = TSrest;
254:                 skipString();
255:                 continue;
256: 
257:             case '<':
258:                 if (p[1] == '!' && isCommentStart())
259:                 {   // Comments start with <!--
260:                     scanComment();
261:                 }
262:                 else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
263:                 {   error("nested tag");
264:                     skipTag();
265:                 }
266:                 else if (istagstart(*skipWhite(p + 1)))
267:                 {   error("nested tag");
268:                     skipTag();
269:                 }
270:                 // Treat comments as if they were whitespace
271:                 state = TSrest;
272:                 continue;
273: 
274:             case 0:
275:             case 0x1a:
276:                 error("end of file before end of tag");
277:                 break;          // end of file
278: 
279:             case '\r':
280:                 if (p[1] == '\n')
281:                     goto Ldefault;
282:             case '\n':
283:                 linnum++;
284:                 // Always extract new lines, so that code lexer counts the
285:                 // lines right.
286:                 dbuf->writeByte(*p);
287:                 state = TSrest;                 // end of tag
288:                 p++;
289:                 continue;
290: 
291:             case ' ':
292:             case '\t':
293:             case '\f':
294:             case '\v':
295:                 if (state == TStagstart)
296:                 {   p++;
297:                     continue;
298:                 }
299:             default:
300:             Ldefault:
301:                 switch (state)
302:                 {
303:                     case TStagstart:            // start of tag name
304:                         assert(istagstart(*p));
305:                         state = TStag;
306:                         tagstart = p;
307:                         taglen = 0;
308:                         break;
309: 
310:                     case TStag:
311:                         if (istag(*p))
312:                         {   // Continuing tag name
313:                             taglen++;
314:                         }
315:                         else
316:                         {   // End of tag name
317:                             state = TSrest;
318:                         }
319:                         break;
320: 
321:                     case TSrest:
322:                         break;
323:                 }
324:                 p++;
325:                 continue;
326:         }
327:         break;
328:     }
329: 
330:     // See if we parsed a <code> or </code> tag
331:     if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
332:         && *(p - 2) != '/') // ignore "<code />" (XHTML)
warning C4996: 'memicmp': The POSIX name for this item is deprecated. Instead, use the ISO C++ conformant name: _memicmp. See online help for details.
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\string.h(93) : see declaration of 'memicmp'
333:     {
334:         if (inot)
335:         {   inCode--;
336:             if (inCode < 0)
337:                 inCode = 0;             // ignore extra </code>'s
338:         }
339:         else
340:             inCode++;
341:     }
342: }
343: 
344: /***********************************************
345:  * Scan to end of attribute string.
346:  */
347: 
348: void Html::skipString()
349: {
350:     int tc = *p;
351: 
352:     while (1)
353:     {
354:         p++;
355:         switch (*p)
356:         {
357:             case '"':
358:             case '\'':
359:                 if (*p == tc)
360:                 {   p++;
361:                     break;
362:                 }
363:                 continue;
364: 
365:             case '\r':
366:                 if (p[1] == '\n')
367:                     goto Ldefault;
368:             case '\n':
369:                 linnum++;
370:                 // Always extract new lines, so that D lexer counts the
371:                 // lines right.
372:                 dbuf->writeByte(*p);
373:                 continue;
374: 
375:             case 0:
376:             case 0x1a:
377:             Leof:
warning C4102: 'Leof' : unreferenced label
378:                 error("end of file before closing %c of string", tc);
379:                 break;
380: 
381:             default:
382:             Ldefault:
383:                 continue;
384:         }
385:         break;
386:     }
387: }
388: 
389: /*********************************
390:  * If p points to any white space, skip it
391:  * and return pointer just past it.
392:  */
393: 
394: unsigned char *Html::skipWhite(unsigned char *q)
395: {
396:     for (; 1; q++)
397:     {
398:         switch (*q)
399:         {
400:             case ' ':
401:             case '\t':
402:             case '\f':
403:             case '\v':
404:             case '\r':
405:             case '\n':
406:                 continue;
407: 
408:             default:
409:                 break;
410:         }
411:         break;
412:     }
413:     return q;
414: }
415: 
416: /***************************************************
417:  * Scan to end of comment.
418:  * Comments are defined any of a number of ways.
419:  * IE 5.0: <!-- followed by >
420:  * "HTML The Definitive Guide": <!-- text with at least one space in it -->
421:  * Netscape: <!-- --> comments nest
422:  * w3c: whitespace can appear between -- and > of comment close
423:  */
424: 
425: void Html::scanComment()
426: {
427:     // Most of the complexity is dealing with the case that
428:     // an arbitrary amount of whitespace can appear between
429:     // the -- and the > of a comment close.
430:     int scangt = 0;
431: 
432:     //printf("scanComment()\n");
433:     if (*p == '\n')
434:     {   linnum++;
435:         // Always extract new lines, so that D lexer counts the
436:         // lines right.
437:         dbuf->writeByte(*p);
438:     }
439:     while (1)
440:     {
441:         //scangt = 1;                   // IE 5.0 compatibility
442:         p++;
443:         switch (*p)
444:         {
445:             case '-':
446:                 if (p[1] == '-')
447:                 {
448:                     if (p[2] == '>')    // optimize for most common case
449:                     {
450:                         p += 3;
451:                         break;
452:                     }
453:                     p++;
454:                     scangt = 1;
455:                 }
456:                 else
457:                     scangt = 0;
458:                 continue;
459: 
460:             case '>':
461:                 if (scangt)
462:                 {   // found -->
463:                     p++;
464:                     break;
465:                 }
466:                 continue;
467: 
468:             case ' ':
469:             case '\t':
470:             case '\f':
471:             case '\v':
472:                 // skip white space
473:                 continue;
474: 
475:             case '\r':
476:                 if (p[1] == '\n')
477:                     goto Ldefault;
478:             case '\n':
479:                 linnum++;               // remember to count lines
480:                 // Always extract new lines, so that D lexer counts the
481:                 // lines right.
482:                 dbuf->writeByte(*p);
483:                 continue;
484: 
485:             case 0:
486:             case 0x1a:
487:                 error("end of file before closing --> of comment");
488:                 break;
489: 
490:             default:
491:             Ldefault:
492:                 scangt = 0;             // it's not -->
493:                 continue;
494:         }
495:         break;
496:     }
497:     //printf("*p = '%c'\n", *p);
498: }
499: 
500: /********************************************
501:  * Determine if we are at the start of a comment.
502:  * Input:
503:  *      p is on the opening '<'
504:  * Returns:
505:  *      0 if not start of a comment
506:  *      1 if start of a comment, p is adjusted to point past --
507:  */
508: 
509: int Html::isCommentStart()
510: #ifdef __DMC__
511:     __out(result)
512:     {
513:         if (result == 0)
514:             ;
515:         else if (result == 1)
516:         {
517:             assert(p[-2] == '-' && p[-1] == '-');
518:         }
519:         else
520:             assert(0);
521:     }
522:     __body
523: #endif /* __DMC__ */
524:     {   unsigned char *s;
525: 
526:         if (p[0] == '<' && p[1] == '!')
527:         {
528:             for (s = p + 2; 1; s++)
529:             {
530:                 switch (*s)
531:                 {
532:                     case ' ':
533:                     case '\t':
534:                     case '\r':
535:                     case '\f':
536:                     case '\v':
537:                         // skip white space, even though spec says no
538:                         // white space is allowed
539:                         continue;
540: 
541:                     case '-':
542:                         if (s[1] == '-')
543:                         {
544:                             p = s + 2;
545:                             return 1;
546:                         }
547:                         goto No;
548: 
549:                     default:
550:                         goto No;
551:                 }
552:             }
553:         }
554:     No:
555:         return 0;
556:     }
557: 
558: int Html::isCDATAStart()
559: {
560:     const char * CDATA_START_MARKER = "<![CDATA[";
561:     size_t len = strlen(CDATA_START_MARKER);
562: 
563:     if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
564:     {
565:         p += len;
566:         return 1;
567:     }
568:     else
569:     {
570:         return 0;
571:     }
572: }
573: 
574: void Html::scanCDATA()
575: {
576:     while(*p && *p != 0x1A)
577:     {
578:         int lineSepLength = isLineSeparator(p);
579:         if (lineSepLength>0)
580:         {
581:             /* Always extract new lines, so that D lexer counts the lines
582:              * right.
583:              */
584:             linnum++;
585:             dbuf->writeByte('\n');
586:             p += lineSepLength;
587:             continue;
588:         }
589:         else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
590:         {
591:             /* end of CDATA section */
592:             p += 3;
593:             return;
594:         }
595:         else if (inCode)
596:         {
597:             /* this CDATA section contains D code */
598:             dbuf->writeByte(*p);
599:         }
600: 
601:         p++;
602:     }
603: }
604: 
605: 
606: /********************************************
607:  * Convert an HTML character entity into a character.
608:  * Forms are:
609:  *      &name;          named entity
610:  *      &#ddd;          decimal
611:  *      &#xhhhh;        hex
612:  * Input:
613:  *      p is on the &
614:  */
615: 
616: int Html::charEntity()
617: {   int c = 0;
618:     int v;
619:     int hex;
620:     unsigned char *pstart = p;
621: 
622:     //printf("Html::charEntity('%c')\n", *p);
623:     if (p[1] == '#')
624:     {
625:         p++;
626:         if (p[1] == 'x' || p[1] == 'X')
627:         {   p++;
628:             hex = 1;
629:         }
630:         else
631:             hex = 0;
632:         if (p[1] == ';')
633:             goto Linvalid;
634:         while (1)
635:         {
636:             p++;
637:             switch (*p)
638:             {
639:                 case 0:
640:                 case 0x1a:
641:                     error("end of file before end of character entity");
642:                     goto Lignore;
643: 
644:                 case '\n':
645:                 case '\r':
646:                 case '<':       // tag start
647:                     // Termination is assumed
648:                     break;
649: 
650:                 case ';':
651:                     // Termination is explicit
652:                     p++;
653:                     break;
654: 
655:                 case '0': case '1': case '2': case '3': case '4':
656:                 case '5': case '6': case '7': case '8': case '9':
657:                     v = *p - '0';
658:                     goto Lvalue;
659: 
660:                 case 'a': case 'b': case 'c':
661:                 case 'd': case 'e': case 'f':
662:                     if (!hex)
663:                         goto Linvalid;
664:                     v = (*p - 'a') + 10;
665:                     goto Lvalue;
666: 
667:                 case 'A': case 'B': case 'C':
668:                 case 'D': case 'E': case 'F':
669:                     if (!hex)
670:                         goto Linvalid;
671:                     v = (*p - 'A') + 10;
672:                     goto Lvalue;
673: 
674:                 Lvalue:
675:                     if (hex)
676:                         c = (c << 4) + v;
677:                     else
678:                         c = (c * 10) + v;
679:                     if (c > 0x10FFFF)
680:                     {
681:                         error("character entity out of range");
682:                         goto Lignore;
683:                     }
684:                     continue;
685: 
686:                 default:
687:                 Linvalid:
688:                     error("invalid numeric character reference");
689:                     goto Lignore;
690:             }
691:             break;
692:         }
693:     }
694:     else
695:     {
696:         // It's a named entity; gather all characters until ;
697:         unsigned char *idstart = p + 1;
698: 
699:         while (1)
700:         {
701:             p++;
702:             switch (*p)
703:             {
704:                 case 0:
705:                 case 0x1a:
706:                     error("end of file before end of character entity");
707:                     break;
708: 
709:                 case '\n':
710:                 case '\r':
711:                 case '<':       // tag start
712:                     // Termination is assumed
713:                     c = HtmlNamedEntity(idstart, p - idstart);
714:                     if (c == -1)
715:                         goto Lignore;
716:                     break;
717: 
718:                 case ';':
719:                     // Termination is explicit
720:                     c = HtmlNamedEntity(idstart, p - idstart);
721:                     if (c == -1)
722:                         goto Lignore;
723:                     p++;
724:                     break;
725: 
726:                 default:
727:                     continue;
728:             }
729:             break;
730:         }
731:     }
732: 
733:     // Kludge to convert non-breaking space to ascii space
734:     if (c == 160)
735:         c = ' ';
736: 
737:     return c;
738: 
739: Lignore:
740:     //printf("Lignore\n");
741:     p = pstart + 1;
742:     return '&';
743: }
744: 
/**
 * Identify DOS, Linux, Mac, Next and Unicode line endings at p.
 * Returns:
 *      0 if p is not on a line separator
 *      >0 the length in bytes of the separator
 * Note: input has to be UTF-8
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
        case '\n':              // Linux
            return 1;

        case '\r':              // Dos (CR LF) or Mac (CR alone)
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:              // Unicode line (E2 80 A8) or paragraph (E2 80 A9) sep.
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:              // Next (C2 85)
            return (p[1] == 0x85) ? 2 : 0;

        default:
            return 0;
    }
}
771: 
772: 
773: