~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/lexer.h

Version: ~ [ 1.0 ] ~

** Warning: Cannot open xref database.

1 #ifndef __LEXER_H__ 2 #define __LEXER_H__ 3 4 /* lexer.h -- Lexer for html parser 5 6 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University 7 See tidy.h for the copyright notice. 8 9 CVS Info: 10 $Author: arnaud02 $ 11 $Date: 2005/08/26 16:08:45 $ 12 $Revision: 1.30 $ 13 14 */ 15 16 /* 17 Given an input source, it returns a sequence of tokens. 18 19 GetToken(source) gets the next token 20 UngetToken(source) provides one level undo 21 22 The tags include an attribute list: 23 24 - linked list of attribute/value nodes 25 - each node has 2 NULL-terminated strings. 26 - entities are replaced in attribute values 27 28 white space is compacted if not in preformatted mode 29 If not in preformatted mode then leading white space 30 is discarded and subsequent white space sequences 31 compacted to single space characters. 32 33 If XmlTags is no then Tag names are folded to upper 34 case and attribute names to lower case. 35 36 Not yet done: 37 - Doctype subset and marked sections 38 */ 39 40 #ifdef __cplusplus 41 extern "C" { 42 #endif 43 44 #include "forward.h" 45 46 /* lexer character types 47 */ 48 #define digit 1 49 #define letter 2 50 #define namechar 4 51 #define white 8 52 #define newline 16 53 #define lowercase 32 54 #define uppercase 64 55 56 57 /* node->type is one of these values 58 */ 59 typedef enum 60 { 61 RootNode, 62 DocTypeTag, 63 CommentTag, 64 ProcInsTag, 65 TextNode, 66 StartTag, 67 EndTag, 68 StartEndTag, 69 CDATATag, 70 SectionTag, 71 AspTag, 72 JsteTag, 73 PhpTag, 74 XmlDecl 75 } NodeType; 76 77 78 79 /* lexer GetToken states 80 */ 81 typedef enum 82 { 83 LEX_CONTENT, 84 LEX_GT, 85 LEX_ENDTAG, 86 LEX_STARTTAG, 87 LEX_COMMENT, 88 LEX_DOCTYPE, 89 LEX_PROCINSTR, 90 LEX_ENDCOMMENT, 91 LEX_CDATA, 92 LEX_SECTION, 93 LEX_ASP, 94 LEX_JSTE, 95 LEX_PHP, 96 LEX_XMLDECL 97 } LexerState; 98 99 /* ParseDocTypeDecl state constants */ 100 typedef enum 101 { 102 DT_INTERMEDIATE, 103 DT_DOCTYPENAME, 104 DT_PUBLICSYSTEM, 105 DT_QUOTEDSTRING, 106 DT_INTSUBSET 107 } ParseDocTypeDeclState; 108 109 /* content model shortcut encoding 110 111 Descriptions are tentative. 112 */ 113 #define CM_UNKNOWN 0 114 /* Elements with no content. Map to HTML specification. */ 115 #define CM_EMPTY (1 << 0) 116 /* Elements that appear outside of "BODY". */ 117 #define CM_HTML (1 << 1) 118 /* Elements that can appear within HEAD. */ 119 #define CM_HEAD (1 << 2) 120 /* HTML "block" elements. */ 121 #define CM_BLOCK (1 << 3) 122 /* HTML "inline" elements. */ 123 #define CM_INLINE (1 << 4) 124 /* Elements that mark list item ("LI"). */ 125 #define CM_LIST (1 << 5) 126 /* Elements that mark definition list item ("DL", "DT"). */ 127 #define CM_DEFLIST (1 << 6) 128 /* Elements that can appear inside TABLE. */ 129 #define CM_TABLE (1 << 7) 130 /* Used for "THEAD", "TFOOT" or "TBODY". */ 131 #define CM_ROWGRP (1 << 8) 132 /* Used for "TD", "TH" */ 133 #define CM_ROW (1 << 9) 134 /* Elements whose content must be protected against white space movement. 135 Includes some elements that can found in forms. */ 136 #define CM_FIELD (1 << 10) 137 /* Used to avoid propagating inline emphasis inside some elements 138 such as OBJECT or APPLET. */ 139 #define CM_OBJECT (1 << 11) 140 /* Elements that allows "PARAM". */ 141 #define CM_PARAM (1 << 12) 142 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ 143 #define CM_FRAMES (1 << 13) 144 /* Heading elements (h1, h2, ...). */ 145 #define CM_HEADING (1 << 14) 146 /* Elements with an optional end tag. */ 147 #define CM_OPT (1 << 15) 148 /* Elements that use "align" attribute for vertical position. */ 149 #define CM_IMG (1 << 16) 150 /* Elements with inline and block model. Used to avoid calling InlineDup. */ 151 #define CM_MIXED (1 << 17) 152 /* Elements whose content needs to be indented only if containing one 153 CM_BLOCK element. */ 154 #define CM_NO_INDENT (1 << 18) 155 /* Elements that are obsolete (such as "dir", "menu"). */ 156 #define CM_OBSOLETE (1 << 19) 157 /* User defined elements. Used to determine how attributes wihout value 158 should be printed. */ 159 #define CM_NEW (1 << 20) 160 /* Elements that cannot be omitted. */ 161 #define CM_OMITST (1 << 21) 162 163 /* If the document uses just HTML 2.0 tags and attributes described 164 ** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 165 ** If there are proprietary tags and attributes then describe it as 166 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes 167 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 168 ** flavors of Voyager (strict, loose or frameset). 169 */ 170 171 /* unknown */ 172 #define xxxx 0u 173 174 /* W3C defined HTML/XHTML family document types */ 175 #define HT20 1u 176 #define HT32 2u 177 #define H40S 4u 178 #define H40T 8u 179 #define H40F 16u 180 #define H41S 32u 181 #define H41T 64u 182 #define H41F 128u 183 #define X10S 256u 184 #define X10T 512u 185 #define X10F 1024u 186 #define XH11 2048u 187 #define XB10 4096u 188 189 /* proprietary stuff */ 190 #define VERS_SUN 8192u 191 #define VERS_NETSCAPE 16384u 192 #define VERS_MICROSOFT 32768u 193 194 /* special flag */ 195 #define VERS_XML 65536u 196 197 /* compatibility symbols */ 198 #define VERS_UNKNOWN (xxxx) 199 #define VERS_HTML20 (HT20) 200 #define VERS_HTML32 (HT32) 201 #define VERS_HTML40_STRICT (H40S|H41S|X10S) 202 #define VERS_HTML40_LOOSE (H40T|H41T|X10T) 203 #define VERS_FRAMESET (H40F|H41F|X10F) 204 #define VERS_XHTML11 (XH11) 205 #define VERS_BASIC (XB10) 206 207 /* meta symbols */ 208 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 209 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 210 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 211 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 212 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 213 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC) 214 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 215 216 /* all W3C defined document types */ 217 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 218 219 /* all proprietary types */ 220 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 221 222 /* Linked list of class names and styles 223 */ 224 struct _Style; 225 typedef struct _Style TagStyle; 226 227 struct _Style 228 { 229 tmbstr tag; 230 tmbstr tag_class; 231 tmbstr properties; 232 TagStyle *next; 233 }; 234 235 236 /* Linked list of style properties 237 */ 238 struct _StyleProp; 239 typedef struct _StyleProp StyleProp; 240 241 struct _StyleProp 242 { 243 tmbstr name; 244 tmbstr value; 245 StyleProp *next; 246 }; 247 248 249 250 251 /* Attribute/Value linked list node 252 */ 253 254 struct _AttVal 255 { 256 AttVal* next; 257 const Attribute* dict; 258 Node* asp; 259 Node* php; 260 int delim; 261 tmbstr attribute; 262 tmbstr value; 263 }; 264 265 266 267 /* 268 Mosaic handles inlines via a separate stack from other elements 269 We duplicate this to recover from inline markup errors such as: 270 271 <i>italic text 272 <p>more italic text</b> normal text 273 274 which for compatibility with Mosaic is mapped to: 275 276 <i>italic text</i> 277 <p><i>more italic text</i> normal text 278 279 Note that any inline end tag pop's the effect of the current 280 inline start tag, so that </b> pop's <i> in the above example. 281 */ 282 struct _IStack 283 { 284 IStack* next; 285 const Dict* tag; /* tag's dictionary definition */ 286 tmbstr element; /* name (NULL for text nodes) */ 287 AttVal* attributes; 288 }; 289 290 291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 292 ** etc. etc. 293 */ 294 295 struct _Node 296 { 297 Node* parent; /* tree structure */ 298 Node* prev; 299 Node* next; 300 Node* content; 301 Node* last; 302 303 AttVal* attributes; 304 const Dict* was; /* old tag when it was changed */ 305 const Dict* tag; /* tag's dictionary definition */ 306 307 tmbstr element; /* name (NULL for text nodes) */ 308 309 uint start; /* start of span onto text array */ 310 uint end; /* end of span onto text array */ 311 NodeType type; /* TextNode, StartTag, EndTag etc. */ 312 313 uint line; /* current line of document */ 314 uint column; /* current column of document */ 315 316 Bool closed; /* true if closed by explicit end tag */ 317 Bool implicit; /* true if inferred */ 318 Bool linebreak; /* true if followed by a line break */ 319 320 #ifdef TIDY_STORE_ORIGINAL_TEXT 321 tmbstr otext; 322 #endif 323 }; 324 325 326 /* 327 The following are private to the lexer 328 Use NewLexer() to create a lexer, and 329 FreeLexer() to free it. 330 */ 331 332 struct _Lexer 333 { 334 #if 0 /* Move to TidyDocImpl */ 335 StreamIn* in; /* document content input */ 336 StreamOut* errout; /* error output stream */ 337 338 uint badAccess; /* for accessibility errors */ 339 uint badLayout; /* for bad style errors */ 340 uint badChars; /* for bad character encodings */ 341 uint badForm; /* for mismatched/mispositioned form tags */ 342 uint warnings; /* count of warnings in this document */ 343 uint errors; /* count of errors */ 344 #endif 345 346 uint lines; /* lines seen */ 347 uint columns; /* at start of current token */ 348 Bool waswhite; /* used to collapse contiguous white space */ 349 Bool pushed; /* true after token has been pushed back */ 350 Bool insertspace; /* when space is moved after end tag */ 351 Bool excludeBlocks; /* Netscape compatibility */ 352 Bool exiled; /* true if moved out of table */ 353 Bool isvoyager; /* true if xmlns attribute on html element */ 354 uint versions; /* bit vector of HTML versions */ 355 uint doctype; /* version as given by doctype (if any) */ 356 uint versionEmitted; /* version of doctype emitted */ 357 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 358 uint txtstart; /* start of current node */ 359 uint txtend; /* end of current node */ 360 LexerState state; /* state of lexer's finite state machine */ 361 362 Node* token; /* current parse point */ 363 Node* root; /* remember root node of the document */ 364 Node* parent; /* remember parent node for CDATA elements */ 365 366 Bool seenEndBody; /* true if a </body> tag has been encountered */ 367 Bool seenEndHtml; /* true if a </html> tag has been encountered */ 368 369 /* 370 Lexer character buffer 371 372 Parse tree nodes span onto this buffer 373 which contains the concatenated text 374 contents of all of the elements. 375 376 lexsize must be reset for each file. 377 */ 378 tmbstr lexbuf; /* MB character buffer */ 379 uint lexlength; /* allocated */ 380 uint lexsize; /* used */ 381 382 /* Inline stack for compatibility with Mosaic */ 383 Node* inode; /* for deferring text node */ 384 IStack* insert; /* for inferring inline tags */ 385 IStack* istack; 386 uint istacklength; /* allocated */ 387 uint istacksize; /* used */ 388 uint istackbase; /* start of frame */ 389 390 TagStyle *styles; /* used for cleaning up presentation markup */ 391 392 #if 0 393 TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 394 #endif 395 }; 396 397 398 /* Lexer Functions 399 */ 400 Node *CommentToken( Lexer *lexer ); 401 402 /* choose what version to use for new doctype */ 403 int HTMLVersion( TidyDocImpl* doc ); 404 405 ctmbstr GetFPIFromVers(uint vers); 406 407 /* everything is allowed in proprietary version of HTML */ 408 /* this is handled here rather than in the tag/attr dicts */ 409 410 void ConstrainVersion( TidyDocImpl* doc, uint vers ); 411 412 Bool IsWhite(uint c); 413 Bool IsDigit(uint c); 414 Bool IsLetter(uint c); 415 Bool IsNewline(uint c); 416 Bool IsNamechar(uint c); 417 Bool IsXMLLetter(uint c); 418 Bool IsXMLNamechar(uint c); 419 420 Bool IsLower(uint c); 421 Bool IsUpper(uint c); 422 uint ToLower(uint c); 423 uint ToUpper(uint c); 424 425 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ); 426 427 428 Lexer* NewLexer( TidyDocImpl* doc ); 429 Bool EndOfInput( TidyDocImpl* doc ); 430 void FreeLexer( TidyDocImpl* doc ); 431 432 /* store character c as UTF-8 encoded byte stream */ 433 void AddCharToLexer( Lexer *lexer, uint c ); 434 435 /* 436 Used for elements and text nodes 437 element name is NULL for text nodes 438 start and end are offsets into lexbuf 439 which contains the textual content of 440 all elements in the parse tree. 441 442 parent and content allow traversal 443 of the parse tree in any direction. 444 attributes are represented as a linked 445 list of AttVal nodes which hold the 446 strings for attribute/value pairs. 447 */ 448 Node* NewNode( Lexer* lexer ); 449 450 451 /* used to clone heading nodes when split by an <HR> */ 452 Node *CloneNode( TidyDocImpl* doc, Node *element ); 453 454 /* free node's attributes */ 455 void FreeAttrs( TidyDocImpl* doc, Node *node ); 456 457 /* doesn't repair attribute list linkage */ 458 void FreeAttribute( TidyDocImpl* doc, AttVal *av ); 459 460 /* detach attribute from node */ 461 void DetachAttribute( Node *node, AttVal *attr ); 462 463 /* detach attribute from node then free it 464 */ 465 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr ); 466 467 /* 468 Free document nodes by iterating through peers and recursing 469 through children. Set next to NULL before calling FreeNode() 470 to avoid freeing peer nodes. Doesn't patch up prev/next links. 471 */ 472 void FreeNode( TidyDocImpl* doc, Node *node ); 473 474 Node* TextToken( Lexer *lexer ); 475 476 /* used for creating preformatted text from Word2000 */ 477 Node *NewLineNode( Lexer *lexer ); 478 479 /* used for adding a &nbsp; for Word2000 */ 480 Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt ); 481 482 Node* CommentToken(Lexer *lexer); 483 Node* GetCDATA( TidyDocImpl* doc, Node *container ); 484 485 void AddByte( Lexer *lexer, tmbchar c ); 486 void AddStringLiteral( Lexer* lexer, ctmbstr str ); 487 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); 488 489 /* find element */ 490 Node* FindDocType( TidyDocImpl* doc ); 491 Node* FindHTML( TidyDocImpl* doc ); 492 Node* FindHEAD( TidyDocImpl* doc ); 493 Node* FindTITLE(TidyDocImpl* doc); 494 Node* FindBody( TidyDocImpl* doc ); 495 Node* FindXmlDecl(TidyDocImpl* doc); 496 497 /* Returns containing block element, if any */ 498 Node* FindContainer( Node* node ); 499 500 /* add meta element for Tidy */ 501 Bool AddGenerator( TidyDocImpl* doc ); 502 503 /* examine <!DOCTYPE> to identify version */ 504 uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ); 505 uint ApparentVersion( TidyDocImpl* doc ); 506 507 508 Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype); 509 510 ctmbstr HTMLVersionName( TidyDocImpl* doc ); 511 ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml ); 512 513 Bool SetXHTMLDocType( TidyDocImpl* doc ); 514 515 516 /* fixup doctype if missing */ 517 Bool FixDocType( TidyDocImpl* doc ); 518 519 /* ensure XML document starts with <?xml version="1.0"?> */ 520 /* add encoding attribute if not using ASCII or UTF-8 output */ 521 Bool FixXmlDecl( TidyDocImpl* doc ); 522 523 Node* InferredTag(TidyDocImpl* doc, TidyTagId id); 524 525 Bool ExpectsContent(Node *node); 526 527 528 void UngetToken( TidyDocImpl* doc ); 529 530 531 /* 532 modes for GetToken() 533 534 MixedContent -- for elements which don't accept PCDATA 535 Preformatted -- white space preserved as is 536 IgnoreMarkup -- for CDATA elements such as script, style 537 */ 538 #define IgnoreWhitespace 0 539 #define MixedContent 1 540 #define Preformatted 2 541 #define IgnoreMarkup 3 542 #define CdataContent 4 543 544 Node* GetToken( TidyDocImpl* doc, uint mode ); 545 546 void InitMap(void); 547 548 Bool IsValidAttrName( ctmbstr attr ); 549 550 551 /* create a new attribute */ 552 AttVal *NewAttribute(void); 553 554 /* create a new attribute with given name and value */ 555 AttVal *NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 556 int delim ); 557 558 /* insert attribute at the end of attribute list of a node */ 559 void InsertAttributeAtEnd( Node *node, AttVal *av ); 560 561 /* insert attribute at the start of attribute list of a node */ 562 void InsertAttributeAtStart( Node *node, AttVal *av ); 563 564 /************************************* 565 In-line Stack functions 566 *************************************/ 567 568 569 /* duplicate attributes */ 570 AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs ); 571 572 /* 573 push a copy of an inline node onto stack 574 but don't push if implicit or OBJECT or APPLET 575 (implicit tags are ones generated from the istack) 576 577 One issue arises with pushing inlines when 578 the tag is already pushed. For instance: 579 580 <p><em>text 581 <p><em>more text 582 583 Shouldn't be mapped to 584 585 <p><em>text</em></p> 586 <p><em><em>more text</em></em> 587 */ 588 void PushInline( TidyDocImpl* doc, Node* node ); 589 590 /* pop inline stack */ 591 void PopInline( TidyDocImpl* doc, Node* node ); 592 593 Bool IsPushed( TidyDocImpl* doc, Node* node ); 594 595 /* 596 This has the effect of inserting "missing" inline 597 elements around the contents of blocklevel elements 598 such as P, TD, TH, DIV, PRE etc. This procedure is 599 called at the start of ParseBlock. when the inline 600 stack is not empty, as will be the case in: 601 602 <i><h1>italic heading</h1></i> 603 604 which is then treated as equivalent to 605 606 <h1><i>italic heading</i></h1> 607 608 This is implemented by setting the lexer into a mode 609 where it gets tokens from the inline stack rather than 610 from the input stream. 611 */ 612 int InlineDup( TidyDocImpl* doc, Node *node ); 613 614 /* 615 defer duplicates when entering a table or other 616 element where the inlines shouldn't be duplicated 617 */ 618 void DeferDup( TidyDocImpl* doc ); 619 Node *InsertedToken( TidyDocImpl* doc ); 620 621 #ifdef __cplusplus 622 } 623 #endif 624 625 626 #endif /* __LEXER_H__ */ 627

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.