Version:
~ [ 1.0 ] ~
1 #ifndef __LEXER_H__
2 #define __LEXER_H__
3
4 /* lexer.h -- Lexer for html parser
5
6 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
7 See tidy.h for the copyright notice.
8
9 CVS Info:
10 $Author: arnaud02 $
11 $Date: 2005/08/26 16:08:45 $
12 $Revision: 1.30 $
13
14 */
15
16 /*
17 Given an input source, it returns a sequence of tokens.
18
19 GetToken(source) gets the next token
20 UngetToken(source) provides one level undo
21
22 The tags include an attribute list:
23
24 - linked list of attribute/value nodes
25 - each node has 2 NULL-terminated strings.
26 - entities are replaced in attribute values
27
28 white space is compacted if not in preformatted mode
29 If not in preformatted mode then leading white space
30 is discarded and subsequent white space sequences
31 compacted to single space characters.
32
33 If XmlTags is no then Tag names are folded to upper
34 case and attribute names to lower case.
35
36 Not yet done:
37 - Doctype subset and marked sections
38 */
39
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43
44 #include "forward.h"
45
46 /* lexer character types: each class is a distinct single bit, so classes can be OR-ed into one mask and tested with a bitwise AND.
47 NOTE(review): these are unprefixed lowercase macro names; any identifier spelled digit, letter, white, newline, etc. that appears after this header is included gets rewritten by the preprocessor. */
48 #define digit 1 /* bit 0 */
49 #define letter 2 /* bit 1 */
50 #define namechar 4 /* bit 2 */
51 #define white 8 /* bit 3 */
52 #define newline 16 /* bit 4 */
53 #define lowercase 32 /* bit 5 */
54 #define uppercase 64 /* bit 6 */
55
56
57 /* node->type is one of these values.
58 No enumerator has an explicit value, so they count up from RootNode = 0 in declaration order; inserting or reordering entries renumbers everything after the change. */
59 typedef enum
60 {
61 RootNode, /* 0 */
62 DocTypeTag, /* 1 */
63 CommentTag, /* 2 */
64 ProcInsTag, /* 3 */
65 TextNode, /* 4 */
66 StartTag, /* 5 */
67 EndTag, /* 6 */
68 StartEndTag, /* 7 */
69 CDATATag, /* 8 */
70 SectionTag, /* 9 */
71 AspTag, /* 10 */
72 JsteTag, /* 11 */
73 PhpTag, /* 12 */
74 XmlDecl /* 13 */
75 } NodeType;
76
77
78
79 /* lexer GetToken states; stored in the state member of struct _Lexer.
80 Values are implicit and count up from LEX_CONTENT = 0 in declaration order. */
81 typedef enum
82 {
83 LEX_CONTENT, /* 0 */
84 LEX_GT, /* 1 */
85 LEX_ENDTAG, /* 2 */
86 LEX_STARTTAG, /* 3 */
87 LEX_COMMENT, /* 4 */
88 LEX_DOCTYPE, /* 5 */
89 LEX_PROCINSTR, /* 6 */
90 LEX_ENDCOMMENT, /* 7 */
91 LEX_CDATA, /* 8 */
92 LEX_SECTION, /* 9 */
93 LEX_ASP, /* 10 */
94 LEX_JSTE, /* 11 */
95 LEX_PHP, /* 12 */
96 LEX_XMLDECL /* 13 */
97 } LexerState;
98
99 /* ParseDocTypeDecl state constants; values are implicit, counting up from DT_INTERMEDIATE = 0 in declaration order. ParseDocTypeDecl itself is not declared in this header. */
100 typedef enum
101 {
102 DT_INTERMEDIATE, /* 0 */
103 DT_DOCTYPENAME, /* 1 */
104 DT_PUBLICSYSTEM, /* 2 */
105 DT_QUOTEDSTRING, /* 3 */
106 DT_INTSUBSET /* 4 */
107 } ParseDocTypeDeclState;
108
109 /* content model shortcut encoding: single-bit flags (bits 0 to 21) intended to be OR-ed together into one mask.
110
111 Descriptions are tentative.
112 */
113 #define CM_UNKNOWN 0 /* no bit set, so it cannot be detected with a bitwise AND */
114 /* Elements with no content. Map to HTML specification. */
115 #define CM_EMPTY (1 << 0)
116 /* Elements that appear outside of "BODY". */
117 #define CM_HTML (1 << 1)
118 /* Elements that can appear within HEAD. */
119 #define CM_HEAD (1 << 2)
120 /* HTML "block" elements. */
121 #define CM_BLOCK (1 << 3)
122 /* HTML "inline" elements. */
123 #define CM_INLINE (1 << 4)
124 /* Elements that mark list item ("LI"). */
125 #define CM_LIST (1 << 5)
126 /* Elements that mark definition list item ("DL", "DT"). NOTE(review): the items of a definition list are DT and DD, so the names quoted here look suspect; confirm against the tag table. */
127 #define CM_DEFLIST (1 << 6)
128 /* Elements that can appear inside TABLE. */
129 #define CM_TABLE (1 << 7)
130 /* Used for "THEAD", "TFOOT" or "TBODY". */
131 #define CM_ROWGRP (1 << 8)
132 /* Used for "TD", "TH" */
133 #define CM_ROW (1 << 9)
134 /* Elements whose content must be protected against white space movement.
135 Includes some elements that can be found in forms. */
136 #define CM_FIELD (1 << 10)
137 /* Used to avoid propagating inline emphasis inside some elements
138 such as OBJECT or APPLET. */
139 #define CM_OBJECT (1 << 11)
140 /* Elements that allow "PARAM". */
141 #define CM_PARAM (1 << 12)
142 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
143 #define CM_FRAMES (1 << 13)
144 /* Heading elements (h1, h2, ...). */
145 #define CM_HEADING (1 << 14)
146 /* Elements with an optional end tag. */
147 #define CM_OPT (1 << 15)
148 /* Elements that use "align" attribute for vertical position. NOTE(review): from this bit upward the shift assumes int is wider than 16 bits. */
149 #define CM_IMG (1 << 16)
150 /* Elements with inline and block model. Used to avoid calling InlineDup. */
151 #define CM_MIXED (1 << 17)
152 /* Elements whose content needs to be indented only if containing one
153 CM_BLOCK element. */
154 #define CM_NO_INDENT (1 << 18)
155 /* Elements that are obsolete (such as "dir", "menu"). */
156 #define CM_OBSOLETE (1 << 19)
157 /* User defined elements. Used to determine how attributes without value
158 should be printed. */
159 #define CM_NEW (1 << 20)
160 /* Elements that cannot be omitted. */
161 #define CM_OMITST (1 << 21)
162
163 /* If the document uses just HTML 2.0 tags and attributes, describe
164 ** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
165 ** If there are proprietary tags and attributes then describe it as
166 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes
167 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
168 ** flavors of Voyager (strict, loose or frameset).
169 */
170
171 /* unknown: no version bit set, so it cannot be detected with a bitwise AND */
172 #define xxxx 0u
173
174 /* W3C defined HTML/XHTML family document types; one bit each (bits 0 to 12) so they can be OR-ed into an unsigned version mask */
175 #define HT20 1u
176 #define HT32 2u
177 #define H40S 4u
178 #define H40T 8u
179 #define H40F 16u
180 #define H41S 32u
181 #define H41T 64u
182 #define H41F 128u
183 #define X10S 256u
184 #define X10T 512u
185 #define X10F 1024u
186 #define XH11 2048u
187 #define XB10 4096u
188
189 /* proprietary stuff (bits 13 to 15) */
190 #define VERS_SUN 8192u
191 #define VERS_NETSCAPE 16384u
192 #define VERS_MICROSOFT 32768u
193
194 /* special flag (bit 16); it is not part of VERS_ALL or VERS_PROPRIETARY */
195 #define VERS_XML 65536u
196
197 /* compatibility symbols: longer names for the short codes above; the STRICT, LOOSE and FRAMESET groups each OR together three of the single-bit codes */
198 #define VERS_UNKNOWN (xxxx)
199 #define VERS_HTML20 (HT20)
200 #define VERS_HTML32 (HT32)
201 #define VERS_HTML40_STRICT (H40S|H41S|X10S)
202 #define VERS_HTML40_LOOSE (H40T|H41T|X10T)
203 #define VERS_FRAMESET (H40F|H41F|X10F)
204 #define VERS_XHTML11 (XH11)
205 #define VERS_BASIC (XB10)
206
207 /* meta symbols: unions of the groups above */
208 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
209 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
210 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
211 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
212 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40)
213 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
214 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10)
215
216 /* all W3C defined document types: expands to every bit from HT20 through XB10 and nothing else */
217 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40)
218
219 /* all proprietary types */
220 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
221
222 /* Linked list of class names and styles; struct _Lexer keeps the list head in its styles member.
223 */
224 struct _Style;
225 typedef struct _Style TagStyle;
226
227 struct _Style
228 {
229 tmbstr tag; /* presumably the element name the style applies to -- confirm in the code that fills this list */
230 tmbstr tag_class; /* class name */
231 tmbstr properties; /* style text associated with the class */
232 TagStyle *next; /* next list entry */
233 };
234
235
236 /* Linked list of style properties: one name/value string pair per entry.
237 */
238 struct _StyleProp;
239 typedef struct _StyleProp StyleProp;
240
241 struct _StyleProp
242 {
243 tmbstr name; /* property name */
244 tmbstr value; /* property value */
245 StyleProp *next; /* next list entry */
246 };
247
248
249
250
251 /* Attribute/Value linked list node. The AttVal, Attribute and Node type names are not declared in this header (presumably they come from forward.h).
252 */
253
254 struct _AttVal
255 {
256 AttVal* next; /* next attribute in the list */
257 const Attribute* dict; /* presumably the dictionary entry for this attribute name -- confirm */
258 Node* asp; /* NOTE(review): looks like an ASP node attached to the attribute -- confirm */
259 Node* php; /* NOTE(review): looks like a PHP node attached to the attribute -- confirm */
260 int delim; /* presumably the quote character that delimited the value -- confirm */
261 tmbstr attribute; /* attribute name string */
262 tmbstr value; /* attribute value string */
263 };
264
265
266
267 /*
268 Mosaic handles inlines via a separate stack from other elements.
269 We duplicate this to recover from inline markup errors such as:
270
271 <i>italic text
272 <p>more italic text</b> normal text
273
274 which for compatibility with Mosaic is mapped to:
275
276 <i>italic text</i>
277 <p><i>more italic text</i> normal text
278
279 Note that any inline end tag pops the effect of the current
280 inline start tag, so that </b> pops <i> in the above example.
281 */
282 struct _IStack
283 {
284 IStack* next; /* link to another stack entry */
285 const Dict* tag; /* tag's dictionary definition */
286 tmbstr element; /* name (NULL for text nodes) */
287 AttVal* attributes; /* attribute list recorded with the entry */
288 };
289
290
291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
292 ** etc. etc.
293 */
294
295 struct _Node
296 {
297 Node* parent; /* tree structure */
298 Node* prev; /* previous sibling */
299 Node* next; /* next sibling */
300 Node* content; /* child list; presumably the first child -- confirm */
301 Node* last; /* presumably the last child -- confirm */
302
303 AttVal* attributes; /* linked list of attribute/value pairs */
304 const Dict* was; /* old tag when it was changed */
305 const Dict* tag; /* tag's dictionary definition */
306
307 tmbstr element; /* name (NULL for text nodes) */
308
309 uint start; /* start of span onto text array */
310 uint end; /* end of span onto text array */
311 NodeType type; /* TextNode, StartTag, EndTag etc. */
312
313 uint line; /* current line of document */
314 uint column; /* current column of document */
315
316 Bool closed; /* true if closed by explicit end tag */
317 Bool implicit; /* true if inferred */
318 Bool linebreak; /* true if followed by a line break */
319
320 #ifdef TIDY_STORE_ORIGINAL_TEXT
321 tmbstr otext; /* exists only when TIDY_STORE_ORIGINAL_TEXT is defined, so the struct layout depends on that macro */
322 #endif
323 };
324
325
326 /*
327 The following are private to the lexer.
328 Use NewLexer() to create a lexer, and
329 FreeLexer() to free it.
330 */
331
332 struct _Lexer
333 {
334 #if 0 /* Move to TidyDocImpl */
335 StreamIn* in; /* document content input */
336 StreamOut* errout; /* error output stream */
337
338 uint badAccess; /* for accessibility errors */
339 uint badLayout; /* for bad style errors */
340 uint badChars; /* for bad character encodings */
341 uint badForm; /* for mismatched/mispositioned form tags */
342 uint warnings; /* count of warnings in this document */
343 uint errors; /* count of errors */
344 #endif
345
346 uint lines; /* lines seen */
347 uint columns; /* at start of current token */
348 Bool waswhite; /* used to collapse contiguous white space */
349 Bool pushed; /* true after token has been pushed back */
350 Bool insertspace; /* when space is moved after end tag */
351 Bool excludeBlocks; /* Netscape compatibility */
352 Bool exiled; /* true if moved out of table */
353 Bool isvoyager; /* true if xmlns attribute on html element */
354 uint versions; /* bit vector of HTML versions (the HT20 ... VERS_XML bits) */
355 uint doctype; /* version as given by doctype (if any) */
356 uint versionEmitted; /* version of doctype emitted */
357 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
358 uint txtstart; /* start of current node */
359 uint txtend; /* end of current node */
360 LexerState state; /* state of lexer's finite state machine */
361
362 Node* token; /* current parse point */
363 Node* root; /* remember root node of the document */
364 Node* parent; /* remember parent node for CDATA elements */
365
366 Bool seenEndBody; /* true if a </body> tag has been encountered */
367 Bool seenEndHtml; /* true if a </html> tag has been encountered */
368
369 /*
370 Lexer character buffer
371
372 Parse tree nodes span onto this buffer
373 which contains the concatenated text
374 contents of all of the elements.
375
376 lexsize must be reset for each file.
377 */
378 tmbstr lexbuf; /* MB character buffer */
379 uint lexlength; /* allocated */
380 uint lexsize; /* used */
381
382 /* Inline stack for compatibility with Mosaic */
383 Node* inode; /* for deferring text node */
384 IStack* insert; /* for inferring inline tags */
385 IStack* istack; /* the inline stack storage; sized by the two counters below */
386 uint istacklength; /* allocated */
387 uint istacksize; /* used */
388 uint istackbase; /* start of frame */
389
390 TagStyle *styles; /* used for cleaning up presentation markup */
391
392 #if 0
393 TidyDocImpl* doc; /* Pointer back to doc for error reporting */
394 #endif
395 };
396
397
398 /* Lexer Functions
399 */
400 Node *CommentToken( Lexer *lexer ); /* NOTE(review): declared a second time further down in this header with the same signature; one of the two is redundant */
401
402 /* choose what version to use for new doctype */
403 int HTMLVersion( TidyDocImpl* doc );
404
405 ctmbstr GetFPIFromVers(uint vers); /* presumably maps a version bit to its formal public identifier string -- confirm in the implementation */
406
407 /* everything is allowed in proprietary version of HTML */
408 /* this is handled here rather than in the tag/attr dicts */
409
410 void ConstrainVersion( TidyDocImpl* doc, uint vers ); /* vers is a mask built from the version bits defined in this header */
411
412 Bool IsWhite(uint c); /* the Is... functions are character-class predicates: each takes a character code as uint and returns Bool */
413 Bool IsDigit(uint c);
414 Bool IsLetter(uint c);
415 Bool IsNewline(uint c);
416 Bool IsNamechar(uint c);
417 Bool IsXMLLetter(uint c);
418 Bool IsXMLNamechar(uint c);
419
420 Bool IsLower(uint c);
421 Bool IsUpper(uint c);
422 uint ToLower(uint c); /* case conversion on a character code; NOTE(review): the name differs from the C library tolower only by letter case */
423 uint ToUpper(uint c);
424
425 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ); /* NOTE(review): takes a tmbchar but returns plain char -- confirm the two types agree */
426
427
428 Lexer* NewLexer( TidyDocImpl* doc ); /* creates a lexer; pair with FreeLexer() */
429 Bool EndOfInput( TidyDocImpl* doc );
430 void FreeLexer( TidyDocImpl* doc ); /* takes the document rather than the Lexer pointer returned by NewLexer() */
431
432 /* store character c as UTF-8 encoded byte stream */
433 void AddCharToLexer( Lexer *lexer, uint c );
434
435 /*
436 Used for elements and text nodes.
437 The element name is NULL for text nodes;
438 start and end are offsets into lexbuf,
439 which contains the textual content of
440 all elements in the parse tree.
441
442 parent and content allow traversal
443 of the parse tree in any direction.
444 Attributes are represented as a linked
445 list of AttVal nodes which hold the
446 strings for attribute/value pairs.
447 */
448 Node* NewNode( Lexer* lexer );
449
450
451 /* used to clone heading nodes when split by an <HR> */
452 Node *CloneNode( TidyDocImpl* doc, Node *element );
453
454 /* free node's attributes */
455 void FreeAttrs( TidyDocImpl* doc, Node *node );
456
457 /* frees a single attribute; doesn't repair attribute list linkage */
458 void FreeAttribute( TidyDocImpl* doc, AttVal *av );
459
460 /* detach attribute from node (takes no doc argument, unlike RemoveAttribute) */
461 void DetachAttribute( Node *node, AttVal *attr );
462
463 /* detach attribute from node then free it
464 */
465 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr );
466
467 /*
468 Free document nodes by iterating through peers and recursing
469 through children. Set next to NULL before calling FreeNode()
470 to avoid freeing peer nodes. Doesn't patch up prev/next links.
471 */
472 void FreeNode( TidyDocImpl* doc, Node *node );
473
474 Node* TextToken( Lexer *lexer );
475
476 /* used for creating preformatted text from Word2000 */
477 Node *NewLineNode( Lexer *lexer );
478
479 /* used for adding a literal text node for Word2000. NOTE(review): the comment originally read "adding a for Word2000"; a word was lost, possibly an HTML entity -- confirm against the upstream source */
480 Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt );
481
482 Node* CommentToken(Lexer *lexer); /* NOTE(review): repeats the declaration at the top of the function list */
483 Node* GetCDATA( TidyDocImpl* doc, Node *container );
484
485 void AddByte( Lexer *lexer, tmbchar c );
486 void AddStringLiteral( Lexer* lexer, ctmbstr str );
487 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); /* NOTE(review): len is a signed int; the handling of negative values is not visible here */
488
489 /* find element; each takes the document and returns a Node pointer (presumably NULL when the element is absent -- confirm) */
490 Node* FindDocType( TidyDocImpl* doc );
491 Node* FindHTML( TidyDocImpl* doc );
492 Node* FindHEAD( TidyDocImpl* doc );
493 Node* FindTITLE(TidyDocImpl* doc);
494 Node* FindBody( TidyDocImpl* doc );
495 Node* FindXmlDecl(TidyDocImpl* doc);
496
497 /* Returns containing block element, if any */
498 Node* FindContainer( Node* node );
499
500 /* add meta element for Tidy */
501 Bool AddGenerator( TidyDocImpl* doc );
502
503 /* examine <!DOCTYPE> to identify version */
504 uint FindGivenVersion( TidyDocImpl* doc, Node* doctype );
505 uint ApparentVersion( TidyDocImpl* doc );
506
507
508 Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype);
509
510 ctmbstr HTMLVersionName( TidyDocImpl* doc );
511 ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml );
512
513 Bool SetXHTMLDocType( TidyDocImpl* doc );
514
515
516 /* fixup doctype if missing */
517 Bool FixDocType( TidyDocImpl* doc );
518
519 /* ensure XML document starts with <?xml version="1.0"?> */
520 /* add encoding attribute if not using ASCII or UTF-8 output */
521 Bool FixXmlDecl( TidyDocImpl* doc );
522
523 Node* InferredTag(TidyDocImpl* doc, TidyTagId id);
524
525 Bool ExpectsContent(Node *node);
526
527
528 void UngetToken( TidyDocImpl* doc ); /* provides one level of undo for GetToken() */
529
530
531 /*
532 modes for GetToken(), passed as its uint mode argument
533
534 MixedContent -- for elements which don't accept PCDATA
535 Preformatted -- white space preserved as is
536 IgnoreMarkup -- for CDATA elements such as script, style
537 NOTE(review): IgnoreWhitespace and CdataContent are defined below but not described here. */
538 #define IgnoreWhitespace 0
539 #define MixedContent 1
540 #define Preformatted 2
541 #define IgnoreMarkup 3
542 #define CdataContent 4
543
544 Node* GetToken( TidyDocImpl* doc, uint mode );
545
546 void InitMap(void); /* NOTE(review): presumably builds the character classification table used by the Is... predicates -- confirm */
547
548 Bool IsValidAttrName( ctmbstr attr );
549
550
551 /* create a new attribute */
552 AttVal *NewAttribute(void);
553
554 /* create a new attribute with given name and value; delim matches the delim member of struct _AttVal */
555 AttVal *NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
556 int delim );
557
558 /* insert attribute at the end of attribute list of a node */
559 void InsertAttributeAtEnd( Node *node, AttVal *av );
560
561 /* insert attribute at the start of attribute list of a node */
562 void InsertAttributeAtStart( Node *node, AttVal *av );
563
564 /*************************************
565 In-line Stack functions
566 *************************************/
567
568
569 /* duplicate attributes: takes an AttVal list and returns an AttVal list */
570 AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs );
571
572 /*
573 Push a copy of an inline node onto the stack,
574 but don't push if implicit or OBJECT or APPLET
575 (implicit tags are ones generated from the istack).
576
577 One issue arises with pushing inlines when
578 the tag is already pushed. For instance:
579
580 <p><em>text
581 <p><em>more text
582
583 Shouldn't be mapped to
584
585 <p><em>text</em></p>
586 <p><em><em>more text</em></em>
587 */
588 void PushInline( TidyDocImpl* doc, Node* node );
589
590 /* pop inline stack */
591 void PopInline( TidyDocImpl* doc, Node* node );
592
593 Bool IsPushed( TidyDocImpl* doc, Node* node );
594
595 /*
596 This has the effect of inserting "missing" inline
597 elements around the contents of blocklevel elements
598 such as P, TD, TH, DIV, PRE etc. This procedure is
599 called at the start of ParseBlock when the inline
600 stack is not empty, as will be the case in:
601
602 <i><h1>italic heading</h1></i>
603
604 which is then treated as equivalent to
605
606 <h1><i>italic heading</i></h1>
607
608 This is implemented by setting the lexer into a mode
609 where it gets tokens from the inline stack rather than
610 from the input stream.
611 */
612 int InlineDup( TidyDocImpl* doc, Node *node ); /* NOTE(review): the meaning of the int result is not stated in this header */
613
614 /*
615 defer duplicates when entering a table or other
616 element where the inlines shouldn't be duplicated
617 */
618 void DeferDup( TidyDocImpl* doc );
619 Node *InsertedToken( TidyDocImpl* doc );
620
621 #ifdef __cplusplus
622 }
623 #endif
624
625
626 #endif /* __LEXER_H__ */
627
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.