Version:
~ [ 1.0 ] ~
** Warning: Cannot open xref database.
1 /*
2 clean.c -- clean up misuse of presentation markup
3
4 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
6
7 CVS Info :
8
9 $Author: arnaud02 $
10 $Date: 2005/08/03 18:06:59 $
11 $Revision: 1.98 $
12
13 Filters from other formats such as Microsoft Word
14 often make excessive use of presentation markup such
15 as font tags, B, I, and the align attribute. By applying
16 a set of production rules, it is straightforward to
17 transform this to use CSS.
18
19 Some rules replace some of the children of an element by
20 style properties on the element, e.g.
21
22 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
24 Such rules are applied to the element's content and then
25 to the element itself until none of the rules apply any more.
26 Having applied all the rules to an element, it will have
27 a style attribute with one or more properties.
28
29 Other rules strip the element they apply to, replacing
30 it by style properties on the contents, e.g.
31
32 <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
33
34 These rules are applied to an element before processing
35 its content and replace the current element by the first
36 element in the exposed content.
37
38 After applying both sets of rules, you can replace the
39 style attribute by a class value and style rule in the
40 document head. To support this, an association of styles
41 and class names is built.
42
43 A naive approach is to rely on string matching to test
44 when two property lists are the same. A better approach
45 would be to first sort the properties before matching.
46
47 */
48
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52
53 #include "tidy-int.h"
54 #include "clean.h"
55 #include "lexer.h"
56 #include "parser.h"
57 #include "attrs.h"
58 #include "message.h"
59 #include "tmbstr.h"
60 #include "utf8.h"
61
62 void RenameElem( Node* node, TidyTagId tid )
63 {
64 const Dict* dict = LookupTagDef( tid );
65 MemFree( node->element );
66 node->element = tmbstrdup( dict->name );
67 node->tag = dict;
68 }
69
70 static void FreeStyleProps(StyleProp *props)
71 {
72 StyleProp *next;
73
74 while (props)
75 {
76 next = props->next;
77 MemFree(props->name);
78 MemFree(props->value);
79 MemFree(props);
80 props = next;
81 }
82 }
83
84 static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value )
85 {
86 StyleProp *first, *prev, *prop;
87 int cmp;
88
89 prev = NULL;
90 first = props;
91
92 while (props)
93 {
94 cmp = tmbstrcmp(props->name, name);
95
96 if (cmp == 0)
97 {
98 /* this property is already defined, ignore new value */
99 return first;
100 }
101
102 if (cmp > 0)
103 {
104 /* insert before this */
105
106 prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
107 prop->name = tmbstrdup(name);
108 prop->value = tmbstrdup(value);
109 prop->next = props;
110
111 if (prev)
112 prev->next = prop;
113 else
114 first = prop;
115
116 return first;
117 }
118
119 prev = props;
120 props = props->next;
121 }
122
123 prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
124 prop->name = tmbstrdup(name);
125 prop->value = tmbstrdup(value);
126 prop->next = NULL;
127
128 if (prev)
129 prev->next = prop;
130 else
131 first = prop;
132
133 return first;
134 }
135
136 /*
137 Create sorted linked list of properties from style string
138 It temporarily places nulls in place of ':' and ';' to
139 delimit the strings for the property name and value.
140 Some systems don't allow you to NULL literal strings,
141 so to avoid this, a copy is made first.
142 */
143 static StyleProp* CreateProps( StyleProp* prop, ctmbstr style )
144 {
145 tmbstr name, value = NULL, name_end, value_end, line;
146 Bool more;
147
148 line = tmbstrdup(style);
149 name = line;
150
151 while (*name)
152 {
153 while (*name == ' ')
154 ++name;
155
156 name_end = name;
157
158 while (*name_end)
159 {
160 if (*name_end == ':')
161 {
162 value = name_end + 1;
163 break;
164 }
165
166 ++name_end;
167 }
168
169 if (*name_end != ':')
170 break;
171
172 while ( value && *value == ' ')
173 ++value;
174
175 value_end = value;
176 more = no;
177
178 while (*value_end)
179 {
180 if (*value_end == ';')
181 {
182 more = yes;
183 break;
184 }
185
186 ++value_end;
187 }
188
189 *name_end = '\0';
190 *value_end = '\0';
191
192 prop = InsertProperty(prop, name, value);
193 *name_end = ':';
194
195 if (more)
196 {
197 *value_end = ';';
198 name = value_end + 1;
199 continue;
200 }
201
202 break;
203 }
204
205 MemFree(line); /* free temporary copy */
206 return prop;
207 }
208
209 static tmbstr CreatePropString(StyleProp *props)
210 {
211 tmbstr style, p, s;
212 uint len;
213 StyleProp *prop;
214
215 /* compute length */
216
217 for (len = 0, prop = props; prop; prop = prop->next)
218 {
219 len += tmbstrlen(prop->name) + 2;
220 if (prop->value)
221 len += tmbstrlen(prop->value) + 2;
222 }
223
224 style = (tmbstr) MemAlloc(len+1);
225 style[0] = '\0';
226
227 for (p = style, prop = props; prop; prop = prop->next)
228 {
229 s = prop->name;
230
231 while((*p++ = *s++))
232 continue;
233
234 if (prop->value)
235 {
236 *--p = ':';
237 *++p = ' ';
238 ++p;
239
240 s = prop->value;
241 while((*p++ = *s++))
242 continue;
243 }
244 if (prop->next == NULL)
245 break;
246
247 *--p = ';';
248 *++p = ' ';
249 ++p;
250 }
251
252 return style;
253 }
254
255 /*
256 create string with merged properties
257 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
258 {
259 tmbstr line;
260 StyleProp *prop;
261
262 prop = CreateProps(NULL, style);
263 prop = CreateProps(prop, property);
264 line = CreatePropString(prop);
265 FreeStyleProps(prop);
266 return line;
267 }
268 */
269
270 void FreeStyles( TidyDocImpl* doc )
271 {
272 Lexer* lexer = doc->lexer;
273 if ( lexer )
274 {
275 TagStyle *style, *next;
276 for ( style = lexer->styles; style; style = next )
277 {
278 next = style->next;
279 MemFree( style->tag );
280 MemFree( style->tag_class );
281 MemFree( style->properties );
282 MemFree( style );
283 }
284 }
285 }
286
287 static tmbstr GensymClass( TidyDocImpl* doc )
288 {
289 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
290 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
291 if ( pfx == NULL || *pfx == 0 )
292 pfx = "c";
293
294 tmbsnprintf(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
295 return tmbstrdup(buf);
296 }
297
298 static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
299 {
300 Lexer* lexer = doc->lexer;
301 TagStyle* style;
302
303 for (style = lexer->styles; style; style=style->next)
304 {
305 if (tmbstrcmp(style->tag, tag) == 0 &&
306 tmbstrcmp(style->properties, properties) == 0)
307 return style->tag_class;
308 }
309
310 style = (TagStyle *)MemAlloc( sizeof(TagStyle) );
311 style->tag = tmbstrdup(tag);
312 style->tag_class = GensymClass( doc );
313 style->properties = tmbstrdup( properties );
314 style->next = lexer->styles;
315 lexer->styles = style;
316 return style->tag_class;
317 }
318
319 /*
320 Add class="foo" to node
321 */
322 void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
323 {
324 AttVal *classattr = AttrGetById(node, TidyAttr_CLASS);;
325
326 /*
327 if there already is a class attribute
328 then append class name after a space.
329 */
330 if (classattr)
331 {
332 uint len = tmbstrlen(classattr->value) +
333 tmbstrlen(classname) + 2;
334 tmbstr s = (tmbstr) MemAlloc( len );
335 tmbstrcpy( s, classattr->value );
336 tmbstrcat( s, " " );
337 tmbstrcat( s, classname );
338 MemFree( classattr->value );
339 classattr->value = s;
340 }
341 else /* create new class attribute */
342 AddAttribute( doc, node, "class", classname );
343 }
344
345
346 /*
347 Find style attribute in node, and replace it
348 by corresponding class attribute. Search for
349 class in style dictionary otherwise gensym
350 new class and add to dictionary.
351
352 Assumes that node doesn't have a class attribute
353 */
354 static void Style2Rule( TidyDocImpl* doc, Node *node)
355 {
356 AttVal *styleattr, *classattr;
357 ctmbstr classname;
358
359 styleattr = AttrGetById(node, TidyAttr_STYLE);
360
361 if (styleattr)
362 {
363 /* fix for http://tidy.sf.net/bug/850215 */
364 if (!styleattr->value)
365 {
366 RemoveAttribute(doc, node, styleattr);
367 return;
368 }
369
370 classname = FindStyle( doc, node->element, styleattr->value );
371 classattr = AttrGetById(node, TidyAttr_CLASS);
372
373 /*
374 if there already is a class attribute
375 then append class name after an underscore
376 */
377 if (classattr)
378 {
379 uint len = tmbstrlen(classattr->value) +
380 tmbstrlen(classname) + 2;
381 tmbstr s = (tmbstr) MemAlloc( len );
382 s[0] = '\0';
383 if (classattr->value)
384 {
385 tmbstrcpy(s, classattr->value);
386 tmbstrcat(s, " ");
387 }
388 tmbstrcat(s, classname);
389 if (classattr->value)
390 MemFree(classattr->value);
391 classattr->value = s;
392 RemoveAttribute( doc, node, styleattr );
393 }
394 else /* reuse style attribute for class attribute */
395 {
396 MemFree(styleattr->attribute);
397 MemFree(styleattr->value);
398 styleattr->attribute = tmbstrdup("class");
399 styleattr->value = tmbstrdup(classname);
400 }
401 }
402 }
403
404 static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
405 {
406 if ( selector && color )
407 {
408 AddStringLiteral(lexer, selector);
409 AddStringLiteral(lexer, " { color: ");
410 AddStringLiteral(lexer, color);
411 AddStringLiteral(lexer, " }\n");
412 }
413 }
414
415 /*
416 move presentation attribs from body to style element
417
418 background="foo" -> body { background-image: url(foo) }
419 bgcolor="foo" -> body { background-color: foo }
420 text="foo" -> body { color: foo }
421 link="foo" -> :link { color: foo }
422 vlink="foo" -> :visited { color: foo }
423 alink="foo" -> :active { color: foo }
424 */
425 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
426 {
427 Lexer* lexer = doc->lexer;
428 tmbstr bgurl = NULL;
429 tmbstr bgcolor = NULL;
430 tmbstr color = NULL;
431 AttVal* attr;
432
433 if (NULL != (attr = AttrGetById(body, TidyAttr_BACKGROUND)))
434 {
435 bgurl = attr->value;
436 attr->value = NULL;
437 RemoveAttribute( doc, body, attr );
438 }
439
440 if (NULL != (attr = AttrGetById(body, TidyAttr_BGCOLOR)))
441 {
442 bgcolor = attr->value;
443 attr->value = NULL;
444 RemoveAttribute( doc, body, attr );
445 }
446
447 if (NULL != (attr = AttrGetById(body, TidyAttr_TEXT)))
448 {
449 color = attr->value;
450 attr->value = NULL;
451 RemoveAttribute( doc, body, attr );
452 }
453
454 if ( bgurl || bgcolor || color )
455 {
456 AddStringLiteral(lexer, " body {\n");
457 if (bgurl)
458 {
459 AddStringLiteral(lexer, " background-image: url(");
460 AddStringLiteral(lexer, bgurl);
461 AddStringLiteral(lexer, ");\n");
462 MemFree(bgurl);
463 }
464 if (bgcolor)
465 {
466 AddStringLiteral(lexer, " background-color: ");
467 AddStringLiteral(lexer, bgcolor);
468 AddStringLiteral(lexer, ";\n");
469 MemFree(bgcolor);
470 }
471 if (color)
472 {
473 AddStringLiteral(lexer, " color: ");
474 AddStringLiteral(lexer, color);
475 AddStringLiteral(lexer, ";\n");
476 MemFree(color);
477 }
478
479 AddStringLiteral(lexer, " }\n");
480 }
481
482 if (NULL != (attr = AttrGetById(body, TidyAttr_LINK)))
483 {
484 AddColorRule(lexer, " :link", attr->value);
485 RemoveAttribute( doc, body, attr );
486 }
487
488 if (NULL != (attr = AttrGetById(body, TidyAttr_VLINK)))
489 {
490 AddColorRule(lexer, " :visited", attr->value);
491 RemoveAttribute( doc, body, attr );
492 }
493
494 if (NULL != (attr = AttrGetById(body, TidyAttr_ALINK)))
495 {
496 AddColorRule(lexer, " :active", attr->value);
497 RemoveAttribute( doc, body, attr );
498 }
499 }
500
501 static Bool NiceBody( TidyDocImpl* doc )
502 {
503 Node* node = FindBody(doc);
504 if (node)
505 {
506 if (AttrGetById(node, TidyAttr_BACKGROUND) ||
507 AttrGetById(node, TidyAttr_BGCOLOR) ||
508 AttrGetById(node, TidyAttr_TEXT) ||
509 AttrGetById(node, TidyAttr_LINK) ||
510 AttrGetById(node, TidyAttr_VLINK) ||
511 AttrGetById(node, TidyAttr_ALINK))
512 {
513 doc->badLayout |= USING_BODY;
514 return no;
515 }
516 }
517
518 return yes;
519 }
520
521 /* create style element using rules from dictionary */
522 static void CreateStyleElement( TidyDocImpl* doc )
523 {
524 Lexer* lexer = doc->lexer;
525 Node *node, *head, *body;
526 TagStyle *style;
527 AttVal *av;
528
529 if ( lexer->styles == NULL && NiceBody(doc) )
530 return;
531
532 node = NewNode( lexer );
533 node->type = StartTag;
534 node->implicit = yes;
535 node->element = tmbstrdup("style");
536 FindTag( doc, node );
537
538 /* insert type attribute */
539 av = NewAttributeEx( doc, "type", "text/css", '"' );
540 InsertAttributeAtStart( node, av );
541
542 body = FindBody( doc );
543 lexer->txtstart = lexer->lexsize;
544 if ( body )
545 CleanBodyAttrs( doc, body );
546
547 for (style = lexer->styles; style; style = style->next)
548 {
549 AddCharToLexer(lexer, ' ');
550 AddStringLiteral(lexer, style->tag);
551 AddCharToLexer(lexer, '.');
552 AddStringLiteral(lexer, style->tag_class);
553 AddCharToLexer(lexer, ' ');
554 AddCharToLexer(lexer, '{');
555 AddStringLiteral(lexer, style->properties);
556 AddCharToLexer(lexer, '}');
557 AddCharToLexer(lexer, '\n');
558 }
559
560 lexer->txtend = lexer->lexsize;
561
562 InsertNodeAtEnd( node, TextToken(lexer) );
563
564 /*
565 now insert style element into document head
566
567 doc is root node. search its children for html node
568 the head node should be first child of html node
569 */
570 if ( NULL != (head = FindHEAD( doc )) )
571 InsertNodeAtEnd( head, node );
572 }
573
574
575 /* ensure bidirectional links are consistent */
576 void FixNodeLinks(Node *node)
577 {
578 Node *child;
579
580 if (node->prev)
581 node->prev->next = node;
582 else
583 node->parent->content = node;
584
585 if (node->next)
586 node->next->prev = node;
587 else
588 node->parent->last = node;
589
590 for (child = node->content; child; child = child->next)
591 child->parent = node;
592 }
593
594 /*
595 used to strip child of node when
596 the node has one and only one child
597 */
598 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
599 {
600 Node *child;
601
602 child = node->content;
603 node->content = child->content;
604 node->last = child->last;
605 child->content = NULL;
606 FreeNode(doc, child);
607
608 for (child = node->content; child; child = child->next)
609 child->parent = node;
610 }
611
612 /*
613 used to strip font start and end tags.
614 Extricate "element", replace it by its content and delete it.
615 */
616 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
617 {
618 if (element->content)
619 {
620 Node *node, *parent = element->parent;
621
622 element->last->next = element->next;
623
624 if (element->next)
625 {
626 element->next->prev = element->last;
627 }
628 else
629 parent->last = element->last;
630
631 if (element->prev)
632 {
633 element->content->prev = element->prev;
634 element->prev->next = element->content;
635 }
636 else
637 parent->content = element->content;
638
639 for (node = element->content; node; node = node->next)
640 node->parent = parent;
641
642 *pnode = element->content;
643
644 element->next = element->content = NULL;
645 FreeNode(doc, element);
646 }
647 else
648 {
649 *pnode = DiscardElement(doc, element);
650 }
651 }
652
653 /*
654 Create new string that consists of the
655 combined style properties in s1 and s2
656
657 To merge property lists, we build a linked
658 list of property/values and insert properties
659 into the list in order, merging values for
660 the same property name.
661 */
662 static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 )
663 {
664 tmbstr s;
665 StyleProp *prop;
666
667 prop = CreateProps(NULL, s1);
668 prop = CreateProps(prop, s2);
669 s = CreatePropString(prop);
670 FreeStyleProps(prop);
671 return s;
672 }
673
674 /*
675 Add style property to element, creating style
676 attribute as needed and adding ; delimiter
677 */
678 static void AddStyleProperty(TidyDocImpl* doc, Node *node, ctmbstr property )
679 {
680 AttVal *av = AttrGetById(node, TidyAttr_STYLE);
681
682 /* if style attribute already exists then insert property */
683
684 if ( av )
685 {
686 if (av->value != NULL)
687 {
688 tmbstr s = MergeProperties( av->value, property );
689 MemFree( av->value );
690 av->value = s;
691 }
692 else
693 {
694 av->value = tmbstrdup( property );
695 }
696 }
697 else /* else create new style attribute */
698 {
699 av = NewAttributeEx( doc, "style", property, '"' );
700 InsertAttributeAtStart( node, av );
701 }
702 }
703
704 static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
705 {
706 AttVal *av;
707 tmbstr s1, s2, names;
708
709 for (s2 = NULL, av = child->attributes; av; av = av->next)
710 {
711 if (attrIsCLASS(av))
712 {
713 s2 = av->value;
714 break;
715 }
716 }
717
718 for (s1 = NULL, av = node->attributes; av; av = av->next)
719 {
720 if (attrIsCLASS(av))
721 {
722 s1 = av->value;
723 break;
724 }
725 }
726
727 if (s1)
728 {
729 if (s2) /* merge class names from both */
730 {
731 uint l1, l2;
732 l1 = tmbstrlen(s1);
733 l2 = tmbstrlen(s2);
734 names = (tmbstr) MemAlloc(l1 + l2 + 2);
735 tmbstrcpy(names, s1);
736 names[l1] = ' ';
737 tmbstrcpy(names+l1+1, s2);
738 MemFree(av->value);
739 av->value = names;
740 }
741 }
742 else if (s2) /* copy class names from child */
743 {
744 av = NewAttributeEx( doc, "class", s2, '"' );
745 InsertAttributeAtStart( node, av );
746 }
747 }
748
749 static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
750 {
751 AttVal *av;
752 tmbstr s1, s2, style;
753
754 /*
755 the child may have a class attribute used
756 for attaching styles, if so the class name
757 needs to be copied to node's class
758 */
759 MergeClasses(doc, node, child);
760
761 for (s2 = NULL, av = child->attributes; av; av = av->next)
762 {
763 if (attrIsSTYLE(av))
764 {
765 s2 = av->value;
766 break;
767 }
768 }
769
770 for (s1 = NULL, av = node->attributes; av; av = av->next)
771 {
772 if (attrIsSTYLE(av))
773 {
774 s1 = av->value;
775 break;
776 }
777 }
778
779 if (s1)
780 {
781 if (s2) /* merge styles from both */
782 {
783 style = MergeProperties(s1, s2);
784 MemFree(av->value);
785 av->value = style;
786 }
787 }
788 else if (s2) /* copy style of child */
789 {
790 av = NewAttributeEx( doc, "style", s2, '"' );
791 InsertAttributeAtStart( node, av );
792 }
793 }
794
/*
 Map the value of a font "size" attribute to a CSS font-size.

 - "0".."6"       absolute size, looked up in sizes[]; size 3
                  maps to NULL (no font-size is emitted for it);
 - "-0".."-6"     relative decrease, each step a factor of 0.8;
 - "-" + other    "smaller";
 - any other first character followed by '0'..'6' (e.g. "+2")
                  relative increase, each step a factor of 1.2;
 - anything else  "larger".

 Returns NULL for a null or empty string. The returned strings
 are static and must not be freed.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr sizes[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr minussizes[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr plussizes[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    if (size == NULL || size[0] == '\0')
        return NULL;

    if ('0' <= size[0] && size[0] <= '6')
    {
        int n = size[0] - '0';
        return sizes[n];
    }

    if (size[0] == '-')
    {
        if ('0' <= size[1] && size[1] <= '6')
        {
            int n = size[1] - '0';
            return minussizes[n];
        }
        return "smaller"; /*"70%"; */
    }

    /* size[0] is non-null here, so reading size[1] is in bounds */
    if ('0' <= size[1] && size[1] <= '6')
    {
        int n = size[1] - '0';
        return plussizes[n];
    }

    return "larger"; /* "140%" */
}
844
845 static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
846 {
847 tmbchar buf[256];
848 tmbsnprintf(buf, sizeof(buf), "font-family: %s", face );
849 AddStyleProperty( doc, node, buf );
850 }
851
852 static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
853 {
854 ctmbstr value = NULL;
855
856 if (nodeIsP(node))
857 {
858 if (tmbstrcmp(size, "6") == 0)
859 value = "h1";
860 else if (tmbstrcmp(size, "5") == 0)
861 value = "h2";
862 else if (tmbstrcmp(size, "4") == 0)
863 value = "h3";
864
865 if (value)
866 {
867 MemFree(node->element);
868 node->element = tmbstrdup(value);
869 FindTag(doc, node);
870 return;
871 }
872 }
873
874 value = FontSize2Name(size);
875
876 if (value)
877 {
878 tmbchar buf[64];
879 tmbsnprintf(buf, sizeof(buf), "font-size: %s", value);
880 AddStyleProperty( doc, node, buf );
881 }
882 }
883
884 static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
885 {
886 tmbchar buf[128];
887 tmbsnprintf(buf, sizeof(buf), "color: %s", color);
888 AddStyleProperty( doc, node, buf );
889 }
890
891 /* force alignment value to lower case */
892 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
893 {
894 uint i;
895 tmbchar buf[128];
896
897 tmbstrcpy( buf, "text-align: " );
898 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
899 {
900 if ( (buf[i] = (tmbchar)ToLower(*align++)) == '\0' )
901 break;
902 }
903 buf[i] = '\0';
904 AddStyleProperty( doc, node, buf );
905 }
906
907 /*
908 add style properties to node corresponding to
909 the font face, size and color attributes
910 */
911 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
912 {
913 while (av)
914 {
915 if (AttrHasValue(av))
916 {
917 if (attrIsFACE(av))
918 AddFontFace( doc, node, av->value );
919 else if (attrIsSIZE(av))
920 AddFontSize( doc, node, av->value );
921 else if (attrIsCOLOR(av))
922 AddFontColor( doc, node, av->value );
923 }
924 av = av->next;
925 }
926 }
927
928 /*
929 Symptom: <p align=center>
930 Action: <p style="text-align: center">
931 */
932 static void TextAlign( TidyDocImpl* doc, Node* node )
933 {
934 AttVal *av, *prev;
935
936 prev = NULL;
937
938 for (av = node->attributes; av; av = av->next)
939 {
940 if (attrIsALIGN(av))
941 {
942 if (prev)
943 prev->next = av->next;
944 else
945 node->attributes = av->next;
946
947 if (av->value)
948 AddAlign( doc, node, av->value );
949
950 FreeAttribute(doc, av);
951 break;
952 }
953
954 prev = av;
955 }
956 }
957
958 /*
959 The clean up rules use the pnode argument to return the
960 next node when the original node has been deleted
961 */
962
963 /*
964 Symptom: <dir> <li> where <li> is only child
965 Action: coerce <dir> <li> to <div> with indent.
966 */
967
968 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
969 {
970 Node *child;
971
972 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
973 {
974 child = node->content;
975
976 if (child == NULL)
977 return no;
978
979 /* check child has no peers */
980
981 if (child->next)
982 return no;
983
984 if ( !nodeIsLI(child) )
985 return no;
986
987 if ( !child->implicit )
988 return no;
989
990 /* coerce dir to div */
991 node->tag = LookupTagDef( TidyTag_DIV );
992 MemFree( node->element );
993 node->element = tmbstrdup("div");
994 AddStyleProperty( doc, node, "margin-left: 2em" );
995 StripOnlyChild( doc, node );
996 return yes;
997 }
998
999 return no;
1000 }
1001
1002 /*
1003 Symptom: <center>
1004 Action: replace <center> by <div style="text-align: center">
1005 */
1006
1007 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1008 {
1009 if ( nodeIsCENTER(node) )
1010 {
1011 if ( cfgBool(doc, TidyDropFontTags) )
1012 {
1013 if (node->content)
1014 {
1015 Node *last = node->last;
1016 DiscardContainer( doc, node, pnode );
1017
1018 node = InferredTag(doc, TidyTag_BR);
1019 InsertNodeAfterElement(last, node);
1020 }
1021 else
1022 {
1023 Node *prev = node->prev, *next = node->next,
1024 *parent = node->parent;
1025 DiscardContainer( doc, node, pnode );
1026
1027 node = InferredTag(doc, TidyTag_BR);
1028 if (next)
1029 InsertNodeBeforeElement(next, node);
1030 else if (prev)
1031 InsertNodeAfterElement(prev, node);
1032 else
1033 InsertNodeAtStart(parent, node);
1034 }
1035
1036 return yes;
1037 }
1038
1039 RenameElem( node, TidyTag_DIV );
1040 AddStyleProperty( doc, node, "text-align: center" );
1041 return yes;
1042 }
1043
1044 return no;
1045 }
1046
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048 Unique attributes (such as ID) disable the action.
1049 Attributes style and class are not dealt with. A call to MergeStyles
1050 will do that.
1051 */
1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1053 {
1054 AttVal *av1, *av2;
1055 TidyAttrId id;
1056
1057 /* Detect attributes that cannot be merged or overwritten. */
1058 if (AttrGetById(child, TidyAttr_ID) != NULL
1059 && AttrGetById(node, TidyAttr_ID) != NULL)
1060 return no;
1061
1062 /* Move child attributes to node. Attributes in node
1063 can be overwritten or merged. */
1064 for (av2 = child->attributes; av2; )
1065 {
1066 /* Dealt by MergeStyles. */
1067 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1068 {
1069 av2 = av2->next;
1070 continue;
1071 }
1072 /* Avoid duplicates in node */
1073 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074 && (av1=AttrGetById(node, id))!= NULL)
1075 RemoveAttribute( doc, node, av1 );
1076
1077 /* Move attribute from child to node */
1078 DetachAttribute( child, av2 );
1079 av1 = av2;
1080 av2 = av2->next;
1081 av1->next = NULL;
1082 InsertAttributeAtEnd( node, av1 );
1083 }
1084
1085 return yes;
1086 }
1087
1088 /*
1089 Symptom <XX><XX>...</XX></XX>
1090 Action: merge the two XXs
1091
1092 For instance, this is useful after nested <dir>s used by Word
1093 for indenting have been converted to <div>s
1094
1095 If state is "no", no merging.
1096 If state is "yes", inner element is discarded. Only Style and Class
1097 attributes are merged using MergeStyles().
1098 If state is "auto", atttibutes are merged as described in CopyAttrs().
1099 Style and Class attributes are merged using MergeStyles().
1100 */
1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102 TidyTagId Id, TidyTriState state, Node *node,
1103 Node **ARG_UNUSED(pnode))
1104 {
1105 Node *child;
1106
1107 if ( state == TidyNoState
1108 || !TagIsId(node, Id) )
1109 return no;
1110
1111 child = node->content;
1112
1113 if ( child == NULL
1114 || child->next != NULL
1115 || !TagIsId(child, Id) )
1116 return no;
1117
1118 if ( state == TidyAutoState
1119 && CopyAttrs(doc, node, child) == no )
1120 return no;
1121
1122 MergeStyles( doc, node, child );
1123 StripOnlyChild( doc, node );
1124 return yes;
1125 }
1126
1127 /*
1128 Symptom: <ul><li><ul>...</ul></li></ul>
1129 Action: discard outer list
1130 */
1131
1132 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1133 {
1134 Node *child, *list;
1135
1136 if ( nodeIsUL(node) || nodeIsOL(node) )
1137 {
1138 child = node->content;
1139
1140 if (child == NULL)
1141 return no;
1142
1143 /* check child has no peers */
1144
1145 if (child->next)
1146 return no;
1147
1148 list = child->content;
1149
1150 if (!list)
1151 return no;
1152
1153 if (list->tag != node->tag)
1154 return no;
1155
1156 /* check list has no peers */
1157 if (list->next)
1158 return no;
1159
1160 *pnode = list; /* Set node to resume iteration */
1161
1162 /* move inner list node into position of outer node */
1163 list->prev = node->prev;
1164 list->next = node->next;
1165 list->parent = node->parent;
1166 FixNodeLinks(list);
1167
1168 /* get rid of outer ul and its li */
1169 child->content = NULL;
1170 FreeNode( doc, child ); /* See test #427841. */
1171 child = NULL;
1172 node->content = NULL;
1173 node->next = NULL;
1174 FreeNode( doc, node );
1175 node = NULL;
1176
1177 /*
1178 If prev node was a list the chances are this node
1179 should be appended to that list. Word has no way of
1180 recognizing nested lists and just uses indents
1181 */
1182
1183 if (list->prev)
1184 {
1185 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1186 && list->prev->last )
1187 {
1188 node = list;
1189 list = node->prev;
1190
1191 child = list->last; /* <li> */
1192
1193 list->next = node->next;
1194 FixNodeLinks(list);
1195
1196 node->parent = child;
1197 node->next = NULL;
1198 node->prev = child->last;
1199 FixNodeLinks(node);
1200 CleanNode( doc, node );
1201 }
1202 }
1203
1204 return yes;
1205 }
1206
1207 return no;
1208 }
1209
1210 /*
1211 Some necessary conditions to apply BlockStyle().
1212 */
1213
1214 static Bool CanApplyBlockStyle( Node *node )
1215 {
1216 if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1217 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218 {
1219 return yes;
1220 }
1221 return no;
1222 }
1223
1224 /*
1225 Symptom: the only child of a block-level element is a
1226 presentation element such as B, I or FONT
1227
1228 Action: add style "font-weight: bold" to the block and
1229 strip the <b> element, leaving its children.
1230
1231 example:
1232
1233 <p>
1234 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235 </p>
1236
1237 becomes:
1238
1239 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240 Draft Recommended Practice
1241 </p>
1242
1243 This code also replaces the align attribute by a style attribute.
1244 However, to avoid CSS problems with Navigator 4, this isn't done
1245 for the elements: caption, tr and table
1246 */
1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248 {
1249 Node *child;
1250
1251 if (CanApplyBlockStyle(node))
1252 {
1253 /* check for align attribute */
1254 if ( !nodeIsCAPTION(node) )
1255 TextAlign( doc, node );
1256
1257 child = node->content;
1258 if (child == NULL)
1259 return no;
1260
1261 /* check child has no peers */
1262 if (child->next)
1263 return no;
1264
1265 if ( nodeIsB(child) )
1266 {
1267 MergeStyles( doc, node, child );
1268 AddStyleProperty( doc, node, "font-weight: bold" );
1269 StripOnlyChild( doc, node );
1270 return yes;
1271 }
1272
1273 if ( nodeIsI(child) )
1274 {
1275 MergeStyles( doc, node, child );
1276 AddStyleProperty( doc, node, "font-style: italic" );
1277 StripOnlyChild( doc, node );
1278 return yes;
1279 }
1280
1281 if ( nodeIsFONT(child) )
1282 {
1283 MergeStyles( doc, node, child );
1284 AddFontStyles( doc, node, child->attributes );
1285 StripOnlyChild( doc, node );
1286 return yes;
1287 }
1288 }
1289
1290 return no;
1291 }
1292
1293 /* the only child of table cell or an inline element such as em */
1294 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1295 {
1296 Node *child;
1297
1298 if ( !nodeIsFONT(node) && nodeHasCM(node, CM_INLINE|CM_ROW) )
1299 {
1300 child = node->content;
1301
1302 if (child == NULL)
1303 return no;
1304
1305 /* check child has no peers */
1306
1307 if (child->next)
1308 return no;
1309
1310 if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) )
1311 {
1312 MergeStyles( doc, node, child );
1313 AddStyleProperty( doc, node, "font-weight: bold" );
1314 StripOnlyChild( doc, node );
1315 return yes;
1316 }
1317
1318 if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) )
1319 {
1320 MergeStyles( doc, node, child );
1321 AddStyleProperty( doc, node, "font-style: italic" );
1322 StripOnlyChild( doc, node );
1323 return yes;
1324 }
1325
1326 if ( nodeIsFONT(child) )
1327 {
1328 MergeStyles( doc, node, child );
1329 AddFontStyles( doc, node, child->attributes );
1330 StripOnlyChild( doc, node );
1331 return yes;
1332 }
1333 }
1334
1335 return no;
1336 }
1337
1338 /*
1339 Replace font elements by span elements, deleting
1340 the font element's attributes and replacing them
1341 by a single style attribute.
1342 */
1343 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1344 {
1345 AttVal *av, *style, *next;
1346
1347 if ( nodeIsFONT(node) )
1348 {
1349 if ( cfgBool(doc, TidyDropFontTags) )
1350 {
1351 DiscardContainer( doc, node, pnode );
1352 return yes;
1353 }
1354
1355 /* if FONT is only child of parent element then leave alone
1356 Do so only if BlockStyle may be succesful. */
1357 if ( node->parent->content == node && node->next == NULL &&
1358 CanApplyBlockStyle(node->parent) )
1359 return no;
1360
1361 AddFontStyles( doc, node, node->attributes );
1362
1363 /* extract style attribute and free the rest */
1364 av = node->attributes;
1365 style = NULL;
1366
1367 while (av)
1368 {
1369 next = av->next;
1370
1371 if (attrIsSTYLE(av))
1372 {
1373 av->next = NULL;
1374 style = av;
1375 }
1376 else
1377 {
1378 FreeAttribute( doc, av );
1379 }
1380 av = next;
1381 }
1382
1383 node->attributes = style;
1384 RenameElem( node, TidyTag_SPAN );
1385 return yes;
1386 }
1387
1388 return no;
1389 }
1390
1391 /*
1392 Applies all matching rules to a node.
1393 */
1394 Node* CleanNode( TidyDocImpl* doc, Node *node )
1395 {
1396 Node *next = NULL;
1397 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1398
1399 for (next = node; nodeIsElement(node); node = next)
1400 {
1401 if ( Dir2Div(doc, node, &next) )
1402 continue;
1403
1404 /* Special case: true result means
1405 ** that arg node and its parent no longer exist.
1406 ** So we must jump back up the CreateStyleProperties()
1407 ** call stack until we have a valid node reference.
1408 */
1409 if ( NestedList(doc, node, &next) )
1410 return next;
1411
1412 if ( Center2Div(doc, node, &next) )
1413 continue;
1414
1415 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1416 continue;
1417
1418 if ( BlockStyle(doc, node, &next) )
1419 continue;
1420
1421 if ( InlineStyle(doc, node, &next) )
1422 continue;
1423
1424 if ( Font2Span(doc, node, &next) )
1425 continue;
1426
1427 break;
1428 }
1429
1430 return next;
1431 }
1432
1433 /* Special case: if the current node is destroyed by
1434 ** CleanNode() lower in the tree, this node and its parent
1435 ** no longer exist. So we must jump back up the CleanTree()
1436 ** call stack until we have a valid node reference.
1437 */
1438
1439 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1440 {
1441 if (node->content)
1442 {
1443 Node *child;
1444 for (child = node->content; child != NULL; child = child->next)
1445 {
1446 child = CleanTree( doc, child );
1447 if ( !child )
1448 break;
1449 }
1450 }
1451
1452 return CleanNode( doc, node );
1453 }
1454
1455 static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1456 {
1457 Node *child;
1458
1459 if (node->content)
1460 {
1461 for (child = node->content;
1462 child != NULL; child = child->next)
1463 {
1464 DefineStyleRules( doc, child );
1465 }
1466 }
1467
1468 Style2Rule( doc, node );
1469 }
1470
1471 void CleanDocument( TidyDocImpl* doc )
1472 {
1473 /* placeholder. CleanTree()/CleanNode() will not
1474 ** zap root element
1475 */
1476 CleanTree( doc, &doc->root );
1477
1478 if ( cfgBool(doc, TidyMakeClean) )
1479 {
1480 DefineStyleRules( doc, &doc->root );
1481 CreateStyleElement( doc );
1482 }
1483 }
1484
1485 /* simplifies <b><b> ... </b> ...</b> etc. */
1486 void NestedEmphasis( TidyDocImpl* doc, Node* node )
1487 {
1488 Node *next;
1489
1490 while (node)
1491 {
1492 next = node->next;
1493
1494 if ( (nodeIsB(node) || nodeIsI(node))
1495 && node->parent && node->parent->tag == node->tag)
1496 {
1497 /* strip redundant inner element */
1498 DiscardContainer( doc, node, &next );
1499 node = next;
1500 continue;
1501 }
1502
1503 if ( node->content )
1504 NestedEmphasis( doc, node->content );
1505
1506 node = next;
1507 }
1508 }
1509
1510
1511
1512 /* replace i by em and b by strong */
1513 void EmFromI( TidyDocImpl* doc, Node* node )
1514 {
1515 while (node)
1516 {
1517 if ( nodeIsI(node) )
1518 RenameElem( node, TidyTag_EM );
1519 else if ( nodeIsB(node) )
1520 RenameElem( node, TidyTag_STRONG );
1521
1522 if ( node->content )
1523 EmFromI( doc, node->content );
1524
1525 node = node->next;
1526 }
1527 }
1528
1529 static Bool HasOneChild(Node *node)
1530 {
1531 return (node->content && node->content->next == NULL);
1532 }
1533
1534 /*
1535 Some people use dir or ul without an li
1536 to indent the content. The pattern to
1537 look for is a list with a single implicit
1538 li. This is recursively replaced by an
1539 implicit blockquote.
1540 */
1541 void List2BQ( TidyDocImpl* doc, Node* node )
1542 {
1543 while (node)
1544 {
1545 if (node->content)
1546 List2BQ( doc, node->content );
1547
1548 if ( node->tag && node->tag->parser == ParseList &&
1549 HasOneChild(node) && node->content->implicit )
1550 {
1551 StripOnlyChild( doc, node );
1552 RenameElem( node, TidyTag_BLOCKQUOTE );
1553 node->implicit = yes;
1554 }
1555
1556 node = node->next;
1557 }
1558 }
1559
1560
1561 /*
1562 Replace implicit blockquote by div with an indent
1563 taking care to reduce nested blockquotes to a single
1564 div with the indent set to match the nesting depth
1565 */
1566 void BQ2Div( TidyDocImpl* doc, Node *node )
1567 {
1568 tmbchar indent_buf[ 32 ];
1569 uint indent;
1570
1571 while (node)
1572 {
1573 if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1574 {
1575 indent = 1;
1576
1577 while( HasOneChild(node) &&
1578 nodeIsBLOCKQUOTE(node->content) &&
1579 node->implicit)
1580 {
1581 ++indent;
1582 StripOnlyChild( doc, node );
1583 }
1584
1585 if (node->content)
1586 BQ2Div( doc, node->content );
1587
1588 tmbsnprintf(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1589 2*indent);
1590
1591 RenameElem( node, TidyTag_DIV );
1592 AddStyleProperty(doc, node, indent_buf );
1593 }
1594 else if (node->content)
1595 BQ2Div( doc, node->content );
1596
1597 node = node->next;
1598 }
1599 }
1600
1601
1602 Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1603 {
1604 Node *check;
1605
1606 for ( check=node; check; check = check->parent )
1607 {
1608 if ( nodeIsTD(check) )
1609 return check;
1610 }
1611 return NULL;
1612 }
1613
1614 /* node is <![if ...]> prune up to <![endif]> */
1615 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1616 {
1617 Lexer* lexer = doc->lexer;
1618
1619 for (;;)
1620 {
1621 ctmbstr lexbuf = lexer->lexbuf + node->start;
1622 if ( tmbstrncmp(lexbuf, "if !supportEmptyParas", 21) == 0 )
1623 {
1624 Node* cell = FindEnclosingCell( doc, node );
1625 if ( cell )
1626 {
1627 /* Need to put into cell so it doesn't look weird
1628 */
1629 Node* nbsp = NewLiteralTextNode( lexer, "\240" );
1630 assert( (byte)'\240' == (byte)160 );
1631 InsertNodeBeforeElement( node, nbsp );
1632 }
1633 }
1634
1635 /* discard node and returns next */
1636 node = DiscardElement( doc, node );
1637
1638 if (node == NULL)
1639 return NULL;
1640
1641 if (node->type == SectionTag)
1642 {
1643 if (tmbstrncmp(lexer->lexbuf + node->start, "if", 2) == 0)
1644 {
1645 node = PruneSection( doc, node );
1646 continue;
1647 }
1648
1649 if (tmbstrncmp(lexer->lexbuf + node->start, "endif", 5) == 0)
1650 {
1651 node = DiscardElement( doc, node );
1652 break;
1653 }
1654 }
1655 }
1656
1657 return node;
1658 }
1659
1660 void DropSections( TidyDocImpl* doc, Node* node )
1661 {
1662 Lexer* lexer = doc->lexer;
1663 while (node)
1664 {
1665 if (node->type == SectionTag)
1666 {
1667 /* prune up to matching endif */
1668 if ((tmbstrncmp(lexer->lexbuf + node->start, "if", 2) == 0) &&
1669 (tmbstrncmp(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1670 {
1671 node = PruneSection( doc, node );
1672 continue;
1673 }
1674
1675 /* discard others as well */
1676 node = DiscardElement( doc, node );
1677 continue;
1678 }
1679
1680 if (node->content)
1681 DropSections( doc, node->content );
1682
1683 node = node->next;
1684 }
1685 }
1686
1687 static void PurgeWord2000Attributes( TidyDocImpl* ARG_UNUSED(doc), Node* node )
1688 {
1689 AttVal *attr, *next, *prev = NULL;
1690
1691 for ( attr = node->attributes; attr; attr = next )
1692 {
1693 next = attr->next;
1694
1695 /* special check for class="Code" denoting pre text */
1696 /* Pass thru user defined styles as HTML class names */
1697 if (attrIsCLASS(attr))
1698 {
1699 if (AttrValueIs(attr, "Code") ||
1700 tmbstrncmp(attr->value, "Mso", 3) != 0 )
1701 {
1702 prev = attr;
1703 continue;
1704 }
1705 }
1706
1707 if (attrIsCLASS(attr) ||
1708 attrIsSTYLE(attr) ||
1709 attrIsLANG(attr) ||
1710 ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1711 (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1712 (attr->attribute && tmbstrncmp(attr->attribute, "x:", 2) == 0) )
1713 {
1714 if (prev)
1715 prev->next = next;
1716 else
1717 node->attributes = next;
1718
1719 FreeAttribute( doc, attr );
1720 }
1721 else
1722 prev = attr;
1723 }
1724 }
1725
1726 /* Word2000 uses span excessively, so we strip span out */
1727 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1728 {
1729 Node *node, *prev = NULL, *content;
1730
1731 /*
1732 deal with span elements that have content
1733 by splicing the content in place of the span
1734 after having processed it
1735 */
1736
1737 CleanWord2000( doc, span->content );
1738 content = span->content;
1739
1740 if (span->prev)
1741 prev = span->prev;
1742 else if (content)
1743 {
1744 node = content;
1745 content = content->next;
1746 RemoveNode(node);
1747 InsertNodeBeforeElement(span, node);
1748 prev = node;
1749 }
1750
1751 while (content)
1752 {
1753 node = content;
1754 content = content->next;
1755 RemoveNode(node);
1756 InsertNodeAfterElement(prev, node);
1757 prev = node;
1758 }
1759
1760 if (span->next == NULL)
1761 span->parent->last = prev;
1762
1763 node = span->next;
1764 span->content = NULL;
1765 DiscardElement( doc, span );
1766 return node;
1767 }
1768
1769 /* map non-breaking spaces to regular spaces */
1770 void NormalizeSpaces(Lexer *lexer, Node *node)
1771 {
1772 while ( node )
1773 {
1774 if ( node->content )
1775 NormalizeSpaces( lexer, node->content );
1776
1777 if (nodeIsText(node))
1778 {
1779 uint i, c;
1780 tmbstr p = lexer->lexbuf + node->start;
1781
1782 for (i = node->start; i < node->end; ++i)
1783 {
1784 c = (byte) lexer->lexbuf[i];
1785
1786 /* look for UTF-8 multibyte character */
1787 if ( c > 0x7F )
1788 i += GetUTF8( lexer->lexbuf + i, &c );
1789
1790 if ( c == 160 )
1791 c = ' ';
1792
1793 p = PutUTF8(p, c);
1794 }
1795 node->end = p - lexer->lexbuf;
1796 }
1797
1798 node = node->next;
1799 }
1800 }
1801
1802 /* used to hunt for hidden preformatted sections */
1803 Bool NoMargins(Node *node)
1804 {
1805 AttVal *attval = AttrGetById(node, TidyAttr_STYLE);
1806
1807 if ( !AttrHasValue(attval) )
1808 return no;
1809
1810 /* search for substring "margin-top: 0" */
1811 if (!tmbsubstr(attval->value, "margin-top: 0"))
1812 return no;
1813
1814 /* search for substring "margin-bottom: 0" */
1815 if (!tmbsubstr(attval->value, "margin-bottom: 0"))
1816 return no;
1817
1818 return yes;
1819 }
1820
1821 /* does element have a single space as its content? */
1822 static Bool SingleSpace( Lexer* lexer, Node* node )
1823 {
1824 if ( node->content )
1825 {
1826 node = node->content;
1827
1828 if ( node->next != NULL )
1829 return no;
1830
1831 if ( node->type != TextNode )
1832 return no;
1833
1834 if ( (node->end - node->start) == 1 &&
1835 lexer->lexbuf[node->start] == ' ' )
1836 return yes;
1837
1838 if ( (node->end - node->start) == 2 )
1839 {
1840 uint c = 0;
1841 GetUTF8( lexer->lexbuf + node->start, &c );
1842 if ( c == 160 )
1843 return yes;
1844 }
1845 }
1846
1847 return no;
1848 }
1849
1850 /*
1851 This is a major clean up to strip out all the extra stuff you get
1852 when you save as web page from Word 2000. It doesn't yet know what
1853 to do with VML tags, but these will appear as errors unless you
1854 declare them as new tags, such as o:p which needs to be declared
1855 as inline.
1856 */
1857 void CleanWord2000( TidyDocImpl* doc, Node *node)
1858 {
1859 /* used to a list from a sequence of bulletted p's */
1860 Lexer* lexer = doc->lexer;
1861 Node* list = NULL;
1862
1863 while ( node )
1864 {
1865 /* get rid of Word's xmlns attributes */
1866 if ( nodeIsHTML(node) )
1867 {
1868 /* check that it's a Word 2000 document */
1869 if ( !GetAttrByName(node, "xmlns:o") &&
1870 !cfgBool(doc, TidyMakeBare) )
1871 return;
1872
1873 FreeAttrs( doc, node );
1874 }
1875
1876 /* fix up preformatted sections by looking for a
1877 ** sequence of paragraphs with zero top/bottom margin
1878 */
1879 if ( nodeIsP(node) )
1880 {
1881 if (NoMargins(node))
1882 {
1883 Node *pre, *next;
1884 CoerceNode(doc, node, TidyTag_PRE, no, yes);
1885
1886 PurgeWord2000Attributes( doc, node );
1887
1888 if (node->content)
1889 CleanWord2000( doc, node->content );
1890
1891 pre = node;
1892 node = node->next;
1893
1894 /* continue to strip p's */
1895
1896 while ( nodeIsP(node) && NoMargins(node) )
1897 {
1898 next = node->next;
1899 RemoveNode(node);
1900 InsertNodeAtEnd(pre, NewLineNode(lexer));
1901 InsertNodeAtEnd(pre, node);
1902 StripSpan( doc, node );
1903 node = next;
1904 }
1905
1906 if (node == NULL)
1907 break;
1908 }
1909 }
1910
1911 if (node->tag && (node->tag->model & CM_BLOCK)
1912 && SingleSpace(lexer, node))
1913 {
1914 node = StripSpan( doc, node );
1915 continue;
1916 }
1917 /* discard Word's style verbiage */
1918 if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1919 node->type == CommentTag )
1920 {
1921 node = DiscardElement( doc, node );
1922 continue;
1923 }
1924
1925 /* strip out all span and font tags Word scatters so liberally! */
1926 if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1927 {
1928 node = StripSpan( doc, node );
1929 continue;
1930 }
1931
1932 if ( nodeIsLINK(node) )
1933 {
1934 AttVal *attr = AttrGetById(node, TidyAttr_REL);
1935
1936 if (AttrValueIs(attr, "File-List"))
1937 {
1938 node = DiscardElement( doc, node );
1939 continue;
1940 }
1941 }
1942
1943 /* discards <o:p> which encodes the paragraph mark */
1944 if ( node->tag && tmbstrcmp(node->tag->name,"o:p")==0)
1945 {
1946 Node* next;
1947 DiscardContainer( doc, node, &next );
1948 node = next;
1949 continue;
1950 }
1951
1952 /* discard empty paragraphs */
1953
1954 if ( node->content == NULL && nodeIsP(node) )
1955 {
1956 /* Use the existing function to ensure consistency */
1957 node = TrimEmptyElement( doc, node );
1958 continue;
1959 }
1960
1961 if ( nodeIsP(node) )
1962 {
1963 AttVal *attr, *atrStyle;
1964
1965 attr = AttrGetById(node, TidyAttr_CLASS);
1966 atrStyle = AttrGetById(node, TidyAttr_STYLE);
1967 /*
1968 (JES) Sometimes Word marks a list item with the following hokie syntax
1969 <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
1970 translate these into <li>
1971 */
1972 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1973 /* map <p class="MsoListNumber"> to <ol>...</ol> */
1974 if ( AttrValueIs(attr, "MsoListBullet") ||
1975 AttrValueIs(attr, "MsoListNumber") ||
1976 AttrContains(atrStyle, "mso-list:") )
1977 {
1978 TidyTagId listType = TidyTag_UL;
1979 if (AttrValueIs(attr, "MsoListNumber"))
1980 listType = TidyTag_OL;
1981
1982 CoerceNode(doc, node, TidyTag_LI, no, yes);
1983
1984 if ( !list || TagId(list) != listType )
1985 {
1986 const Dict* tag = LookupTagDef( listType );
1987 list = InferredTag(doc, tag->id);
1988 InsertNodeBeforeElement(node, list);
1989 }
1990
1991 PurgeWord2000Attributes( doc, node );
1992
1993 if ( node->content )
1994 CleanWord2000( doc, node->content );
1995
1996 /* remove node and append to contents of list */
1997 RemoveNode(node);
1998 InsertNodeAtEnd(list, node);
1999 node = list;
2000 }
2001 /* map sequence of <p class="Code"> to <pre>...</pre> */
2002 else if (AttrValueIs(attr, "Code"))
2003 {
2004 Node *br = NewLineNode(lexer);
2005 NormalizeSpaces(lexer, node->content);
2006
2007 if ( !list || TagId(list) != TidyTag_PRE )
2008 {
2009 list = InferredTag(doc, TidyTag_PRE);
2010 InsertNodeBeforeElement(node, list);
2011 }
2012
2013 /* remove node and append to contents of list */
2014 RemoveNode(node);
2015 InsertNodeAtEnd(list, node);
2016 StripSpan( doc, node );
2017 InsertNodeAtEnd(list, br);
2018 node = list->next;
2019 }
2020 else
2021 list = NULL;
2022 }
2023 else
2024 list = NULL;
2025
2026 if (!node)
2027 return;
2028
2029 /* strip out style and class attributes */
2030 if (nodeIsElement(node))
2031 PurgeWord2000Attributes( doc, node );
2032
2033 if (node->content)
2034 CleanWord2000( doc, node->content );
2035
2036 node = node->next;
2037 }
2038 }
2039
2040 Bool IsWord2000( TidyDocImpl* doc )
2041 {
2042 AttVal *attval;
2043 Node *node, *head;
2044 Node *html = FindHTML( doc );
2045
2046 if (html && GetAttrByName(html, "xmlns:o"))
2047 return yes;
2048
2049 /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2050 head = FindHEAD( doc );
2051
2052 if (head)
2053 {
2054 for (node = head->content; node; node = node->next)
2055 {
2056 if ( !nodeIsMETA(node) )
2057 continue;
2058
2059 attval = AttrGetById( node, TidyAttr_NAME );
2060
2061 if ( !AttrValueIs(attval, "generator") )
2062 continue;
2063
2064 attval = AttrGetById( node, TidyAttr_CONTENT );
2065
2066 if ( AttrContains(attval, "Microsoft") )
2067 return yes;
2068 }
2069 }
2070
2071 return no;
2072 }
2073
2074 /* where appropriate move object elements from head to body */
2075 void BumpObject( TidyDocImpl* doc, Node *html )
2076 {
2077 Node *node, *next, *head = NULL, *body = NULL;
2078
2079 if (!html)
2080 return;
2081
2082 for ( node = html->content; node != NULL; node = node->next )
2083 {
2084 if ( nodeIsHEAD(node) )
2085 head = node;
2086
2087 if ( nodeIsBODY(node) )
2088 body = node;
2089 }
2090
2091 if ( head != NULL && body != NULL )
2092 {
2093 for (node = head->content; node != NULL; node = next)
2094 {
2095 next = node->next;
2096
2097 if ( nodeIsOBJECT(node) )
2098 {
2099 Node *child;
2100 Bool bump = no;
2101
2102 for (child = node->content; child != NULL; child = child->next)
2103 {
2104 /* bump to body unless content is param */
2105 if ( (nodeIsText(child) && !IsBlank(doc->lexer, node))
2106 || !nodeIsPARAM(child) )
2107 {
2108 bump = yes;
2109 break;
2110 }
2111 }
2112
2113 if ( bump )
2114 {
2115 RemoveNode( node );
2116 InsertNodeAtStart( body, node );
2117 }
2118 }
2119 }
2120 }
2121 }
2122
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/*
  Processes the children of pParent first. If pParent is a block
  element, trailing <br> children are then handled one at a time:
  the first attribute-less one is discarded, the others are moved
  to just after pParent. Finally pParent is passed to
  TrimEmptyElement().
*/
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pNode, *pNext;
    Bool bBRDeleted = no;

    if ( pParent == NULL )
        return;

    /*  First, check the status of All My Children  */
    for ( pNode = pParent->content; pNode != NULL; pNode = pNext )
    {
        /* The node may get trimmed, so save the next pointer, if any */
        pNext = pNode->next;
        FixBrakes( pDoc, pNode );
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /*  As long as my last child is a <br />, move it to my last peer  */
    pNode = pParent->last;
    while ( pNode != NULL && nodeIsBR( pNode ) )
    {
        if ( pNode->attributes == NULL && bBRDeleted == no )
        {
            DiscardElement( pDoc, pNode );
            bBRDeleted = yes;
        }
        else
        {
            RemoveNode( pNode );
            InsertNodeAfterElement( pParent, pNode );
        }

        pNode = pParent->last;
    }

    TrimEmptyElement( pDoc, pParent );
}
#endif
2166
2167 void VerifyHTTPEquiv(TidyDocImpl* pDoc, Node *head)
2168 {
2169 Node *pNode;
2170 StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2171 tmbstr s, pszBegin, pszEnd;
2172 ctmbstr enc = GetEncodingNameFromTidyId(cfg(pDoc, TidyOutCharEncoding));
2173
2174 if (!enc)
2175 return;
2176
2177 if (!nodeIsHEAD(head))
2178 head = FindHEAD(pDoc);
2179
2180 if (!head)
2181 return;
2182
2183 /* Find any <meta http-equiv='Content-Type' content='...' /> */
2184 for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2185 {
2186 AttVal* httpEquiv = AttrGetById(pNode, TidyAttr_HTTP_EQUIV);
2187 AttVal* metaContent = AttrGetById(pNode, TidyAttr_CONTENT);
2188
2189 if ( !nodeIsMETA(pNode) || !metaContent ||
2190 !AttrValueIs(httpEquiv, "Content-Type") )
2191 continue;
2192
2193 pszBegin = s = tmbstrdup( metaContent->value );
2194 while (pszBegin && *pszBegin)
2195 {
2196 while (isspace( *pszBegin ))
2197 pszBegin++;
2198 pszEnd = pszBegin;
2199 while ('\0' != *pszEnd && ';' != *pszEnd)
2200 pszEnd++;
2201 if (';' == *pszEnd )
2202 *(pszEnd++) = '\0';
2203 if (pszEnd > pszBegin)
2204 {
2205 prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
2206 prop->name = tmbstrdup( pszBegin );
2207 prop->value = NULL;
2208 prop->next = NULL;
2209
2210 if (NULL != pLastProp)
2211 pLastProp->next = prop;
2212 else
2213 pFirstProp = prop;
2214
2215 pLastProp = prop;
2216 pszBegin = pszEnd;
2217 }
2218 }
2219 MemFree( s );
2220
2221 /* find the charset property */
2222 for (prop = pFirstProp; NULL != prop; prop = prop->next)
2223 {
2224 if (0 != tmbstrncasecmp( prop->name, "charset", 7 ))
2225 continue;
2226
2227 MemFree( prop->name );
2228 prop->name = (tmbstr)MemAlloc( 8 + tmbstrlen(enc) + 1 );
2229 tmbstrcpy(prop->name, "charset=");
2230 tmbstrcpy(prop->name+8, enc);
2231 s = CreatePropString( pFirstProp );
2232 MemFree( metaContent->value );
2233 metaContent->value = s;
2234 break;
2235 }
2236 /* #718127, prevent memory leakage */
2237 FreeStyleProps(pFirstProp);
2238 pFirstProp = NULL;
2239 pLastProp = NULL;
2240 }
2241 }
2242
2243 void DropComments(TidyDocImpl* doc, Node* node)
2244 {
2245 Node* next;
2246
2247 while (node)
2248 {
2249 next = node->next;
2250
2251 if (node->type == CommentTag)
2252 {
2253 RemoveNode(node);
2254 FreeNode(doc, node);
2255 node = next;
2256 continue;
2257 }
2258
2259 if (node->content)
2260 DropComments(doc, node->content);
2261
2262 node = next;
2263 }
2264 }
2265
2266 void DropFontElements(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2267 {
2268 Node* next;
2269
2270 while (node)
2271 {
2272 next = node->next;
2273
2274 if (nodeIsFONT(node))
2275 {
2276 DiscardContainer(doc, node, &next);
2277 node = next;
2278 continue;
2279 }
2280
2281 if (node->content)
2282 DropFontElements(doc, node->content, &next);
2283
2284 node = next;
2285 }
2286 }
2287
2288 void WbrToSpace(TidyDocImpl* doc, Node* node)
2289 {
2290 Node* next;
2291
2292 while (node)
2293 {
2294 next = node->next;
2295
2296 if (nodeIsWBR(node))
2297 {
2298 Node* text;
2299 text = NewLiteralTextNode(doc->lexer, " ");
2300 InsertNodeAfterElement(node, text);
2301 RemoveNode(node);
2302 FreeNode(doc, node);
2303 node = next;
2304 continue;
2305 }
2306
2307 if (node->content)
2308 WbrToSpace(doc, node->content);
2309
2310 node = next;
2311 }
2312 }
2313
2314 /*
2315 Filters from Word and PowerPoint often use smart
2316 quotes resulting in character codes between 128
2317 and 159. Unfortunately, the corresponding HTML 4.0
2318 entities for these are not widely supported. The
2319 following converts dashes and quotation marks to
2320 the nearest ASCII equivalent. My thanks to
2321 Andrzej Novosiolov for his help with this code.
2322
2323 Note: The old code in the pretty printer applied
2324 this to all node types and attribute values while
2325 this routine applies it only to text nodes. First,
2326 Microsoft Office products rarely put the relevant
2327 characters into these tokens, second support for
2328 them is much better now and last but not least, it
2329 can be harmful to replace these characters since
2330 US-ASCII quote marks are often used as syntax
2331 characters, a simple
2332
2333 <a onmouseover="alert('‘')">...</a>
2334
2335 would be broken if the U+2018 is replaced by "'".
2336 The old code would neither take care whether the
2337 quote mark is already used as delimiter,
2338
2339 <p title='‘'>...</p>
2340
2341 got
2342
2343 <p title='''>...</p>
2344
2345 Since browser support is much better nowadays and
2346 high-quality typography is better than ASCII it'd
2347 be probably a good idea to drop the feature...
2348 */
2349 void DowngradeTypography(TidyDocImpl* doc, Node* node)
2350 {
2351 Node* next;
2352 Lexer* lexer = doc->lexer;
2353
2354 while (node)
2355 {
2356 next = node->next;
2357
2358 if (nodeIsText(node))
2359 {
2360 uint i, c;
2361 tmbstr p = lexer->lexbuf + node->start;
2362
2363 for (i = node->start; i < node->end; ++i)
2364 {
2365 c = (unsigned char) lexer->lexbuf[i];
2366
2367 if (c > 0x7F)
2368 i += GetUTF8(lexer->lexbuf + i, &c);
2369
2370 if (c >= 0x2013 && c <= 0x201E)
2371 {
2372 switch (c)
2373 {
2374 case 0x2013: /* en dash */
2375 case 0x2014: /* em dash */
2376 c = '-';
2377 break;
2378 case 0x2018: /* left single quotation mark */
2379 case 0x2019: /* right single quotation mark */
2380 case 0x201A: /* single low-9 quotation mark */
2381 c = '\'';
2382 break;
2383 case 0x201C: /* left double quotation mark */
2384 case 0x201D: /* right double quotation mark */
2385 case 0x201E: /* double low-9 quotation mark */
2386 c = '"';
2387 break;
2388 }
2389 }
2390
2391 p = PutUTF8(p, c);
2392 }
2393
2394 node->end = p - lexer->lexbuf;
2395 }
2396
2397 if (node->content)
2398 DowngradeTypography(doc, node->content);
2399
2400 node = next;
2401 }
2402 }
2403
2404 void ReplacePreformattedSpaces(TidyDocImpl* doc, Node* node)
2405 {
2406 Node* next;
2407
2408 while (node)
2409 {
2410 next = node->next;
2411
2412 if (node->tag && node->tag->parser == ParsePre)
2413 {
2414 NormalizeSpaces(doc->lexer, node->content);
2415 node = next;
2416 continue;
2417 }
2418
2419 if (node->content)
2420 ReplacePreformattedSpaces(doc, node->content);
2421
2422 node = next;
2423 }
2424 }
2425
2426 void ConvertCDATANodes(TidyDocImpl* doc, Node* node)
2427 {
2428 Node* next;
2429
2430 while (node)
2431 {
2432 next = node->next;
2433
2434 if (node->type == CDATATag)
2435 node->type = TextNode;
2436
2437 if (node->content)
2438 ConvertCDATANodes(doc, node->content);
2439
2440 node = next;
2441 }
2442 }
2443
2444 /*
2445 FixLanguageInformation ensures that the document contains (only)
2446 the attributes for language information desired by the output
2447 document type. For example, for XHTML 1.0 documents both
2448 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2449 is desired and for HTML 4.01 only 'lang' is desired.
2450 */
2451 void FixLanguageInformation(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2452 {
2453 Node* next;
2454
2455 while (node)
2456 {
2457 next = node->next;
2458
2459 /* todo: report modifications made here to the report system */
2460
2461 if (nodeIsElement(node))
2462 {
2463 AttVal* lang = AttrGetById(node, TidyAttr_LANG);
2464 AttVal* xmlLang = AttrGetById(node, TidyAttr_XML_LANG);
2465
2466 if (lang && xmlLang)
2467 {
2468 /*
2469 todo: check whether both attributes are in sync,
2470 here or elsewhere, where elsewhere is probably
2471 preferable.
2472 AD - March 2005: not mandatory according the standards.
2473 */
2474 }
2475 else if (lang && wantXmlLang)
2476 {
2477 if (NodeAttributeVersions( node, TidyAttr_XML_LANG )
2478 & doc->lexer->versionEmitted)
2479 RepairAttrValue(doc, node, "xml:lang", lang->value);
2480 }
2481 else if (xmlLang && wantLang)
2482 {
2483 if (NodeAttributeVersions( node, TidyAttr_LANG )
2484 & doc->lexer->versionEmitted)
2485 RepairAttrValue(doc, node, "lang", xmlLang->value);
2486 }
2487
2488 if (lang && !wantLang)
2489 RemoveAttribute(doc, node, lang);
2490
2491 if (xmlLang && !wantXmlLang)
2492 RemoveAttribute(doc, node, xmlLang);
2493 }
2494
2495 if (node->content)
2496 FixLanguageInformation(doc, node->content, wantXmlLang, wantLang);
2497
2498 node = next;
2499 }
2500 }
2501
2502 /*
2503 Set/fix/remove <html xmlns='...'>
2504 */
2505 void FixXhtmlNamespace(TidyDocImpl* doc, Bool wantXmlns)
2506 {
2507 Node* html = FindHTML(doc);
2508 AttVal* xmlns;
2509
2510 if (!html)
2511 return;
2512
2513 xmlns = AttrGetById(html, TidyAttr_XMLNS);
2514
2515 if (wantXmlns)
2516 {
2517 if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2518 RepairAttrValue(doc, html, "xmlns", XHTML_NAMESPACE);
2519 }
2520 else if (xmlns)
2521 {
2522 RemoveAttribute(doc, html, xmlns);
2523 }
2524 }
2525
2526 /*
2527 ...
2528 */
2529 void FixAnchors(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2530 {
2531 Node* next;
2532
2533 while (node)
2534 {
2535 next = node->next;
2536
2537 if (IsAnchorElement(doc, node))
2538 {
2539 AttVal *name = AttrGetById(node, TidyAttr_NAME);
2540 AttVal *id = AttrGetById(node, TidyAttr_ID);
2541
2542 /* todo: how are empty name/id attributes handled? */
2543
2544 if (name && id)
2545 {
2546 Bool NameHasValue = AttrHasValue(name);
2547 Bool IdHasValue = AttrHasValue(id);
2548 if ( (NameHasValue != IdHasValue) ||
2549 (NameHasValue && IdHasValue &&
2550 tmbstrcmp(name->value, id->value) != 0 ) )
2551 ReportAttrError( doc, node, name, ID_NAME_MISMATCH);
2552 }
2553 else if (name && wantId)
2554 {
2555 if (NodeAttributeVersions( node, TidyAttr_ID )
2556 & doc->lexer->versionEmitted)
2557 {
2558 if (IsValidHTMLID(name->value))
2559 {
2560 RepairAttrValue(doc, node, "id", name->value);
2561 }
2562 else
2563 {
2564 ReportAttrError(doc, node, name, INVALID_XML_ID);
2565 }
2566 }
2567 }
2568 else if (id && wantName)
2569 {
2570 if (NodeAttributeVersions( node, TidyAttr_NAME )
2571 & doc->lexer->versionEmitted)
2572 /* todo: do not assume id is valid */
2573 RepairAttrValue(doc, node, "name", id->value);
2574 }
2575
2576 if (id && !wantId)
2577 RemoveAttribute(doc, node, id);
2578
2579 if (name && !wantName)
2580 RemoveAttribute(doc, node, name);
2581
2582 if (AttrGetById(node, TidyAttr_NAME) == NULL &&
2583 AttrGetById(node, TidyAttr_ID) == NULL)
2584 RemoveAnchorByNode(doc, node);
2585 }
2586
2587 if (node->content)
2588 FixAnchors(doc, node->content, wantName, wantId);
2589
2590 node = next;
2591 }
2592 }
2593
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.