Version:
~ [ 1.0 ] ~
1 /*
2 clean.c -- clean up misuse of presentation markup
3
4 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
6
7 CVS Info :
8
9 $Author: arnaud02 $
10 $Date: 2005/08/03 18:06:59 $
11 $Revision: 1.98 $
12
13 Filters from other formats such as Microsoft Word
14 often make excessive use of presentation markup such
15 as font tags, B, I, and the align attribute. By applying
16 a set of production rules, it is straightforward to
17 transform this to use CSS.
18
19 Some rules replace some of the children of an element by
20 style properties on the element, e.g.
21
22 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
24 Such rules are applied to the element's content and then
25 to the element itself until no more rules apply.
26 Having applied all the rules to an element, it will have
27 a style attribute with one or more properties.
28
29 Other rules strip the element they apply to, replacing
30 it by style properties on the contents, e.g.
31
32 <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
33
34 These rules are applied to an element before processing
35 its content and replace the current element by the first
36 element in the exposed content.
37
38 After applying both sets of rules, you can replace the
39 style attribute by a class value and style rule in the
40 document head. To support this, an association of styles
41 and class names is built.
42
43 A naive approach is to rely on string matching to test
44 when two property lists are the same. A better approach
45 would be to first sort the properties before matching.
46
47 */
48
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52
53 #include "tidy-int.h"
54 #include "clean.h"
55 #include "lexer.h"
56 #include "parser.h"
57 #include "attrs.h"
58 #include "message.h"
59 #include "tmbstr.h"
60 #include "utf8.h"
61
62 void RenameElem( Node* node, TidyTagId tid )
63 {
64 const Dict* dict = LookupTagDef( tid );
65 MemFree( node->element );
66 node->element = tmbstrdup( dict->name );
67 node->tag = dict;
68 }
69
70 static void FreeStyleProps(StyleProp *props)
71 {
72 StyleProp *next;
73
74 while (props)
75 {
76 next = props->next;
77 MemFree(props->name);
78 MemFree(props->value);
79 MemFree(props);
80 props = next;
81 }
82 }
83
84 static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value )
85 {
86 StyleProp *first, *prev, *prop;
87 int cmp;
88
89 prev = NULL;
90 first = props;
91
92 while (props)
93 {
94 cmp = tmbstrcmp(props->name, name);
95
96 if (cmp == 0)
97 {
98 /* this property is already defined, ignore new value */
99 return first;
100 }
101
102 if (cmp > 0)
103 {
104 /* insert before this */
105
106 prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
107 prop->name = tmbstrdup(name);
108 prop->value = tmbstrdup(value);
109 prop->next = props;
110
111 if (prev)
112 prev->next = prop;
113 else
114 first = prop;
115
116 return first;
117 }
118
119 prev = props;
120 props = props->next;
121 }
122
123 prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
124 prop->name = tmbstrdup(name);
125 prop->value = tmbstrdup(value);
126 prop->next = NULL;
127
128 if (prev)
129 prev->next = prop;
130 else
131 first = prop;
132
133 return first;
134 }
135
136 /*
137 Create sorted linked list of properties from style string
138 It temporarily places nulls in place of ':' and ';' to
139 delimit the strings for the property name and value.
140 Some systems don't allow you to NULL literal strings,
141 so to avoid this, a copy is made first.
142 */
143 static StyleProp* CreateProps( StyleProp* prop, ctmbstr style )
144 {
145 tmbstr name, value = NULL, name_end, value_end, line;
146 Bool more;
147
148 line = tmbstrdup(style);
149 name = line;
150
151 while (*name)
152 {
153 while (*name == ' ')
154 ++name;
155
156 name_end = name;
157
158 while (*name_end)
159 {
160 if (*name_end == ':')
161 {
162 value = name_end + 1;
163 break;
164 }
165
166 ++name_end;
167 }
168
169 if (*name_end != ':')
170 break;
171
172 while ( value && *value == ' ')
173 ++value;
174
175 value_end = value;
176 more = no;
177
178 while (*value_end)
179 {
180 if (*value_end == ';')
181 {
182 more = yes;
183 break;
184 }
185
186 ++value_end;
187 }
188
189 *name_end = '\0';
190 *value_end = '\0';
191
192 prop = InsertProperty(prop, name, value);
193 *name_end = ':';
194
195 if (more)
196 {
197 *value_end = ';';
198 name = value_end + 1;
199 continue;
200 }
201
202 break;
203 }
204
205 MemFree(line); /* free temporary copy */
206 return prop;
207 }
208
209 static tmbstr CreatePropString(StyleProp *props)
210 {
211 tmbstr style, p, s;
212 uint len;
213 StyleProp *prop;
214
215 /* compute length */
216
217 for (len = 0, prop = props; prop; prop = prop->next)
218 {
219 len += tmbstrlen(prop->name) + 2;
220 if (prop->value)
221 len += tmbstrlen(prop->value) + 2;
222 }
223
224 style = (tmbstr) MemAlloc(len+1);
225 style[0] = '\0';
226
227 for (p = style, prop = props; prop; prop = prop->next)
228 {
229 s = prop->name;
230
231 while((*p++ = *s++))
232 continue;
233
234 if (prop->value)
235 {
236 *--p = ':';
237 *++p = ' ';
238 ++p;
239
240 s = prop->value;
241 while((*p++ = *s++))
242 continue;
243 }
244 if (prop->next == NULL)
245 break;
246
247 *--p = ';';
248 *++p = ' ';
249 ++p;
250 }
251
252 return style;
253 }
254
255 /*
256 create string with merged properties
257 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
258 {
259 tmbstr line;
260 StyleProp *prop;
261
262 prop = CreateProps(NULL, style);
263 prop = CreateProps(prop, property);
264 line = CreatePropString(prop);
265 FreeStyleProps(prop);
266 return line;
267 }
268 */
269
270 void FreeStyles( TidyDocImpl* doc )
271 {
272 Lexer* lexer = doc->lexer;
273 if ( lexer )
274 {
275 TagStyle *style, *next;
276 for ( style = lexer->styles; style; style = next )
277 {
278 next = style->next;
279 MemFree( style->tag );
280 MemFree( style->tag_class );
281 MemFree( style->properties );
282 MemFree( style );
283 }
284 }
285 }
286
287 static tmbstr GensymClass( TidyDocImpl* doc )
288 {
289 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
290 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
291 if ( pfx == NULL || *pfx == 0 )
292 pfx = "c";
293
294 tmbsnprintf(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
295 return tmbstrdup(buf);
296 }
297
298 static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
299 {
300 Lexer* lexer = doc->lexer;
301 TagStyle* style;
302
303 for (style = lexer->styles; style; style=style->next)
304 {
305 if (tmbstrcmp(style->tag, tag) == 0 &&
306 tmbstrcmp(style->properties, properties) == 0)
307 return style->tag_class;
308 }
309
310 style = (TagStyle *)MemAlloc( sizeof(TagStyle) );
311 style->tag = tmbstrdup(tag);
312 style->tag_class = GensymClass( doc );
313 style->properties = tmbstrdup( properties );
314 style->next = lexer->styles;
315 lexer->styles = style;
316 return style->tag_class;
317 }
318
319 /*
320 Add class="foo" to node
321 */
322 void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
323 {
324 AttVal *classattr = AttrGetById(node, TidyAttr_CLASS);;
325
326 /*
327 if there already is a class attribute
328 then append class name after a space.
329 */
330 if (classattr)
331 {
332 uint len = tmbstrlen(classattr->value) +
333 tmbstrlen(classname) + 2;
334 tmbstr s = (tmbstr) MemAlloc( len );
335 tmbstrcpy( s, classattr->value );
336 tmbstrcat( s, " " );
337 tmbstrcat( s, classname );
338 MemFree( classattr->value );
339 classattr->value = s;
340 }
341 else /* create new class attribute */
342 AddAttribute( doc, node, "class", classname );
343 }
344
345
346 /*
347 Find style attribute in node, and replace it
348 by corresponding class attribute. Search for
349 class in style dictionary otherwise gensym
350 new class and add to dictionary.
351
352 Assumes that node doesn't have a class attribute
353 */
354 static void Style2Rule( TidyDocImpl* doc, Node *node)
355 {
356 AttVal *styleattr, *classattr;
357 ctmbstr classname;
358
359 styleattr = AttrGetById(node, TidyAttr_STYLE);
360
361 if (styleattr)
362 {
363 /* fix for http://tidy.sf.net/bug/850215 */
364 if (!styleattr->value)
365 {
366 RemoveAttribute(doc, node, styleattr);
367 return;
368 }
369
370 classname = FindStyle( doc, node->element, styleattr->value );
371 classattr = AttrGetById(node, TidyAttr_CLASS);
372
373 /*
374 if there already is a class attribute
375 then append class name after an underscore
376 */
377 if (classattr)
378 {
379 uint len = tmbstrlen(classattr->value) +
380 tmbstrlen(classname) + 2;
381 tmbstr s = (tmbstr) MemAlloc( len );
382 s[0] = '\0';
383 if (classattr->value)
384 {
385 tmbstrcpy(s, classattr->value);
386 tmbstrcat(s, " ");
387 }
388 tmbstrcat(s, classname);
389 if (classattr->value)
390 MemFree(classattr->value);
391 classattr->value = s;
392 RemoveAttribute( doc, node, styleattr );
393 }
394 else /* reuse style attribute for class attribute */
395 {
396 MemFree(styleattr->attribute);
397 MemFree(styleattr->value);
398 styleattr->attribute = tmbstrdup("class");
399 styleattr->value = tmbstrdup(classname);
400 }
401 }
402 }
403
404 static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
405 {
406 if ( selector && color )
407 {
408 AddStringLiteral(lexer, selector);
409 AddStringLiteral(lexer, " { color: ");
410 AddStringLiteral(lexer, color);
411 AddStringLiteral(lexer, " }\n");
412 }
413 }
414
415 /*
416 move presentation attribs from body to style element
417
418 background="foo" -> body { background-image: url(foo) }
419 bgcolor="foo" -> body { background-color: foo }
420 text="foo" -> body { color: foo }
421 link="foo" -> :link { color: foo }
422 vlink="foo" -> :visited { color: foo }
423 alink="foo" -> :active { color: foo }
424 */
425 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
426 {
427 Lexer* lexer = doc->lexer;
428 tmbstr bgurl = NULL;
429 tmbstr bgcolor = NULL;
430 tmbstr color = NULL;
431 AttVal* attr;
432
433 if (NULL != (attr = AttrGetById(body, TidyAttr_BACKGROUND)))
434 {
435 bgurl = attr->value;
436 attr->value = NULL;
437 RemoveAttribute( doc, body, attr );
438 }
439
440 if (NULL != (attr = AttrGetById(body, TidyAttr_BGCOLOR)))
441 {
442 bgcolor = attr->value;
443 attr->value = NULL;
444 RemoveAttribute( doc, body, attr );
445 }
446
447 if (NULL != (attr = AttrGetById(body, TidyAttr_TEXT)))
448 {
449 color = attr->value;
450 attr->value = NULL;
451 RemoveAttribute( doc, body, attr );
452 }
453
454 if ( bgurl || bgcolor || color )
455 {
456 AddStringLiteral(lexer, " body {\n");
457 if (bgurl)
458 {
459 AddStringLiteral(lexer, " background-image: url(");
460 AddStringLiteral(lexer, bgurl);
461 AddStringLiteral(lexer, ");\n");
462 MemFree(bgurl);
463 }
464 if (bgcolor)
465 {
466 AddStringLiteral(lexer, " background-color: ");
467 AddStringLiteral(lexer, bgcolor);
468 AddStringLiteral(lexer, ";\n");
469 MemFree(bgcolor);
470 }
471 if (color)
472 {
473 AddStringLiteral(lexer, " color: ");
474 AddStringLiteral(lexer, color);
475 AddStringLiteral(lexer, ";\n");
476 MemFree(color);
477 }
478
479 AddStringLiteral(lexer, " }\n");
480 }
481
482 if (NULL != (attr = AttrGetById(body, TidyAttr_LINK)))
483 {
484 AddColorRule(lexer, " :link", attr->value);
485 RemoveAttribute( doc, body, attr );
486 }
487
488 if (NULL != (attr = AttrGetById(body, TidyAttr_VLINK)))
489 {
490 AddColorRule(lexer, " :visited", attr->value);
491 RemoveAttribute( doc, body, attr );
492 }
493
494 if (NULL != (attr = AttrGetById(body, TidyAttr_ALINK)))
495 {
496 AddColorRule(lexer, " :active", attr->value);
497 RemoveAttribute( doc, body, attr );
498 }
499 }
500
501 static Bool NiceBody( TidyDocImpl* doc )
502 {
503 Node* node = FindBody(doc);
504 if (node)
505 {
506 if (AttrGetById(node, TidyAttr_BACKGROUND) ||
507 AttrGetById(node, TidyAttr_BGCOLOR) ||
508 AttrGetById(node, TidyAttr_TEXT) ||
509 AttrGetById(node, TidyAttr_LINK) ||
510 AttrGetById(node, TidyAttr_VLINK) ||
511 AttrGetById(node, TidyAttr_ALINK))
512 {
513 doc->badLayout |= USING_BODY;
514 return no;
515 }
516 }
517
518 return yes;
519 }
520
521 /* create style element using rules from dictionary */
522 static void CreateStyleElement( TidyDocImpl* doc )
523 {
524 Lexer* lexer = doc->lexer;
525 Node *node, *head, *body;
526 TagStyle *style;
527 AttVal *av;
528
529 if ( lexer->styles == NULL && NiceBody(doc) )
530 return;
531
532 node = NewNode( lexer );
533 node->type = StartTag;
534 node->implicit = yes;
535 node->element = tmbstrdup("style");
536 FindTag( doc, node );
537
538 /* insert type attribute */
539 av = NewAttributeEx( doc, "type", "text/css", '"' );
540 InsertAttributeAtStart( node, av );
541
542 body = FindBody( doc );
543 lexer->txtstart = lexer->lexsize;
544 if ( body )
545 CleanBodyAttrs( doc, body );
546
547 for (style = lexer->styles; style; style = style->next)
548 {
549 AddCharToLexer(lexer, ' ');
550 AddStringLiteral(lexer, style->tag);
551 AddCharToLexer(lexer, '.');
552 AddStringLiteral(lexer, style->tag_class);
553 AddCharToLexer(lexer, ' ');
554 AddCharToLexer(lexer, '{');
555 AddStringLiteral(lexer, style->properties);
556 AddCharToLexer(lexer, '}');
557 AddCharToLexer(lexer, '\n');
558 }
559
560 lexer->txtend = lexer->lexsize;
561
562 InsertNodeAtEnd( node, TextToken(lexer) );
563
564 /*
565 now insert style element into document head
566
567 doc is root node. search its children for html node
568 the head node should be first child of html node
569 */
570 if ( NULL != (head = FindHEAD( doc )) )
571 InsertNodeAtEnd( head, node );
572 }
573
574
575 /* ensure bidirectional links are consistent */
576 void FixNodeLinks(Node *node)
577 {
578 Node *child;
579
580 if (node->prev)
581 node->prev->next = node;
582 else
583 node->parent->content = node;
584
585 if (node->next)
586 node->next->prev = node;
587 else
588 node->parent->last = node;
589
590 for (child = node->content; child; child = child->next)
591 child->parent = node;
592 }
593
594 /*
595 used to strip child of node when
596 the node has one and only one child
597 */
598 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
599 {
600 Node *child;
601
602 child = node->content;
603 node->content = child->content;
604 node->last = child->last;
605 child->content = NULL;
606 FreeNode(doc, child);
607
608 for (child = node->content; child; child = child->next)
609 child->parent = node;
610 }
611
612 /*
613 used to strip font start and end tags.
614 Extricate "element", replace it by its content and delete it.
615 */
616 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
617 {
618 if (element->content)
619 {
620 Node *node, *parent = element->parent;
621
622 element->last->next = element->next;
623
624 if (element->next)
625 {
626 element->next->prev = element->last;
627 }
628 else
629 parent->last = element->last;
630
631 if (element->prev)
632 {
633 element->content->prev = element->prev;
634 element->prev->next = element->content;
635 }
636 else
637 parent->content = element->content;
638
639 for (node = element->content; node; node = node->next)
640 node->parent = parent;
641
642 *pnode = element->content;
643
644 element->next = element->content = NULL;
645 FreeNode(doc, element);
646 }
647 else
648 {
649 *pnode = DiscardElement(doc, element);
650 }
651 }
652
653 /*
654 Create new string that consists of the
655 combined style properties in s1 and s2
656
657 To merge property lists, we build a linked
658 list of property/values and insert properties
659 into the list in order, merging values for
660 the same property name.
661 */
662 static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 )
663 {
664 tmbstr s;
665 StyleProp *prop;
666
667 prop = CreateProps(NULL, s1);
668 prop = CreateProps(prop, s2);
669 s = CreatePropString(prop);
670 FreeStyleProps(prop);
671 return s;
672 }
673
674 /*
675 Add style property to element, creating style
676 attribute as needed and adding ; delimiter
677 */
678 static void AddStyleProperty(TidyDocImpl* doc, Node *node, ctmbstr property )
679 {
680 AttVal *av = AttrGetById(node, TidyAttr_STYLE);
681
682 /* if style attribute already exists then insert property */
683
684 if ( av )
685 {
686 if (av->value != NULL)
687 {
688 tmbstr s = MergeProperties( av->value, property );
689 MemFree( av->value );
690 av->value = s;
691 }
692 else
693 {
694 av->value = tmbstrdup( property );
695 }
696 }
697 else /* else create new style attribute */
698 {
699 av = NewAttributeEx( doc, "style", property, '"' );
700 InsertAttributeAtStart( node, av );
701 }
702 }
703
704 static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
705 {
706 AttVal *av;
707 tmbstr s1, s2, names;
708
709 for (s2 = NULL, av = child->attributes; av; av = av->next)
710 {
711 if (attrIsCLASS(av))
712 {
713 s2 = av->value;
714 break;
715 }
716 }
717
718 for (s1 = NULL, av = node->attributes; av; av = av->next)
719 {
720 if (attrIsCLASS(av))
721 {
722 s1 = av->value;
723 break;
724 }
725 }
726
727 if (s1)
728 {
729 if (s2) /* merge class names from both */
730 {
731 uint l1, l2;
732 l1 = tmbstrlen(s1);
733 l2 = tmbstrlen(s2);
734 names = (tmbstr) MemAlloc(l1 + l2 + 2);
735 tmbstrcpy(names, s1);
736 names[l1] = ' ';
737 tmbstrcpy(names+l1+1, s2);
738 MemFree(av->value);
739 av->value = names;
740 }
741 }
742 else if (s2) /* copy class names from child */
743 {
744 av = NewAttributeEx( doc, "class", s2, '"' );
745 InsertAttributeAtStart( node, av );
746 }
747 }
748
749 static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
750 {
751 AttVal *av;
752 tmbstr s1, s2, style;
753
754 /*
755 the child may have a class attribute used
756 for attaching styles, if so the class name
757 needs to be copied to node's class
758 */
759 MergeClasses(doc, node, child);
760
761 for (s2 = NULL, av = child->attributes; av; av = av->next)
762 {
763 if (attrIsSTYLE(av))
764 {
765 s2 = av->value;
766 break;
767 }
768 }
769
770 for (s1 = NULL, av = node->attributes; av; av = av->next)
771 {
772 if (attrIsSTYLE(av))
773 {
774 s1 = av->value;
775 break;
776 }
777 }
778
779 if (s1)
780 {
781 if (s2) /* merge styles from both */
782 {
783 style = MergeProperties(s1, s2);
784 MemFree(av->value);
785 av->value = style;
786 }
787 }
788 else if (s2) /* copy style of child */
789 {
790 av = NewAttributeEx( doc, "style", s2, '"' );
791 InsertAttributeAtStart( node, av );
792 }
793 }
794
/*
 Map the value of a FONT size attribute to a CSS font-size value.

 Only the first one or two characters of size are examined:

   ""            -> NULL
   "0" .. "6"    -> absolute size from the sizes table
                    ("3" maps to NULL, i.e. no font-size is needed)
   "-0" .. "-6"  -> relative size, steps of 0.8
   "-" + other   -> "smaller"
   "x0" .. "x6"  -> relative size, steps of 1.2, where x is any
                    other leading character (normally '+')
   anything else -> "larger"

 The returned string is a static literal and must not be freed.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr sizes[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr minussizes[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr plussizes[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    if (size[0] == '\0')
        return NULL;

    /* absolute size: a single digit 0..6 indexes the table directly */
    if ('0' <= size[0] && size[0] <= '6')
    {
        int n = size[0] - '0';
        return sizes[n];
    }

    if (size[0] == '-')
    {
        if ('0' <= size[1] && size[1] <= '6')
        {
            int n = size[1] - '0';
            return minussizes[n];
        }
        return "smaller"; /*"70%"; */
    }

    /* any other leading character is treated like '+' */
    if ('0' <= size[1] && size[1] <= '6')
    {
        int n = size[1] - '0';
        return plussizes[n];
    }

    return "larger"; /* "140%" */
}
844
845 static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
846 {
847 tmbchar buf[256];
848 tmbsnprintf(buf, sizeof(buf), "font-family: %s", face );
849 AddStyleProperty( doc, node, buf );
850 }
851
852 static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
853 {
854 ctmbstr value = NULL;
855
856 if (nodeIsP(node))
857 {
858 if (tmbstrcmp(size, "6") == 0)
859 value = "h1";
860 else if (tmbstrcmp(size, "5") == 0)
861 value = "h2";
862 else if (tmbstrcmp(size, "4") == 0)
863 value = "h3";
864
865 if (value)
866 {
867 MemFree(node->element);
868 node->element = tmbstrdup(value);
869 FindTag(doc, node);
870 return;
871 }
872 }
873
874 value = FontSize2Name(size);
875
876 if (value)
877 {
878 tmbchar buf[64];
879 tmbsnprintf(buf, sizeof(buf), "font-size: %s", value);
880 AddStyleProperty( doc, node, buf );
881 }
882 }
883
884 static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
885 {
886 tmbchar buf[128];
887 tmbsnprintf(buf, sizeof(buf), "color: %s", color);
888 AddStyleProperty( doc, node, buf );
889 }
890
891 /* force alignment value to lower case */
892 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
893 {
894 uint i;
895 tmbchar buf[128];
896
897 tmbstrcpy( buf, "text-align: " );
898 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
899 {
900 if ( (buf[i] = (tmbchar)ToLower(*align++)) == '\0' )
901 break;
902 }
903 buf[i] = '\0';
904 AddStyleProperty( doc, node, buf );
905 }
906
907 /*
908 add style properties to node corresponding to
909 the font face, size and color attributes
910 */
911 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
912 {
913 while (av)
914 {
915 if (AttrHasValue(av))
916 {
917 if (attrIsFACE(av))
918 AddFontFace( doc, node, av->value );
919 else if (attrIsSIZE(av))
920 AddFontSize( doc, node, av->value );
921 else if (attrIsCOLOR(av))
922 AddFontColor( doc, node, av->value );
923 }
924 av = av->next;
925 }
926 }
927
928 /*
929 Symptom: <p align=center>
930 Action: <p style="text-align: center">
931 */
932 static void TextAlign( TidyDocImpl* doc, Node* node )
933 {
934 AttVal *av, *prev;
935
936 prev = NULL;
937
938 for (av = node->attributes; av; av = av->next)
939 {
940 if (attrIsALIGN(av))
941 {
942 if (prev)
943 prev->next = av->next;
944 else
945 node->attributes = av->next;
946
947 if (av->value)
948 AddAlign( doc, node, av->value );
949
950 FreeAttribute(doc, av);
951 break;
952 }
953
954 prev = av;
955 }
956 }
957
958 /*
959 The clean up rules use the pnode argument to return the
960 next node when the original node has been deleted
961 */
962
963 /*
964 Symptom: <dir> <li> where <li> is only child
965 Action: coerce <dir> <li> to <div> with indent.
966 */
967
968 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
969 {
970 Node *child;
971
972 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
973 {
974 child = node->content;
975
976 if (child == NULL)
977 return no;
978
979 /* check child has no peers */
980
981 if (child->next)
982 return no;
983
984 if ( !nodeIsLI(child) )
985 return no;
986
987 if ( !child->implicit )
988 return no;
989
990 /* coerce dir to div */
991 node->tag = LookupTagDef( TidyTag_DIV );
992 MemFree( node->element );
993 node->element = tmbstrdup("div");
994 AddStyleProperty( doc, node, "margin-left: 2em" );
995 StripOnlyChild( doc, node );
996 return yes;
997 }
998
999 return no;
1000 }
1001
1002 /*
1003 Symptom: <center>
1004 Action: replace <center> by <div style="text-align: center">
1005 */
1006
1007 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1008 {
1009 if ( nodeIsCENTER(node) )
1010 {
1011 if ( cfgBool(doc, TidyDropFontTags) )
1012 {
1013 if (node->content)
1014 {
1015 Node *last = node->last;
1016 DiscardContainer( doc, node, pnode );
1017
1018 node = InferredTag(doc, TidyTag_BR);
1019 InsertNodeAfterElement(last, node);
1020 }
1021 else
1022 {
1023 Node *prev = node->prev, *next = node->next,
1024 *parent = node->parent;
1025 DiscardContainer( doc, node, pnode );
1026
1027 node = InferredTag(doc, TidyTag_BR);
1028 if (next)
1029 InsertNodeBeforeElement(next, node);
1030 else if (prev)
1031 InsertNodeAfterElement(prev, node);
1032 else
1033 InsertNodeAtStart(parent, node);
1034 }
1035
1036 return yes;
1037 }
1038
1039 RenameElem( node, TidyTag_DIV );
1040 AddStyleProperty( doc, node, "text-align: center" );
1041 return yes;
1042 }
1043
1044 return no;
1045 }
1046
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048 Unique attributes (such as ID) disable the action.
1049 Attributes style and class are not dealt with. A call to MergeStyles
1050 will do that.
1051 */
1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1053 {
1054 AttVal *av1, *av2;
1055 TidyAttrId id;
1056
1057 /* Detect attributes that cannot be merged or overwritten. */
1058 if (AttrGetById(child, TidyAttr_ID) != NULL
1059 && AttrGetById(node, TidyAttr_ID) != NULL)
1060 return no;
1061
1062 /* Move child attributes to node. Attributes in node
1063 can be overwritten or merged. */
1064 for (av2 = child->attributes; av2; )
1065 {
1066 /* Dealt by MergeStyles. */
1067 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1068 {
1069 av2 = av2->next;
1070 continue;
1071 }
1072 /* Avoid duplicates in node */
1073 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074 && (av1=AttrGetById(node, id))!= NULL)
1075 RemoveAttribute( doc, node, av1 );
1076
1077 /* Move attribute from child to node */
1078 DetachAttribute( child, av2 );
1079 av1 = av2;
1080 av2 = av2->next;
1081 av1->next = NULL;
1082 InsertAttributeAtEnd( node, av1 );
1083 }
1084
1085 return yes;
1086 }
1087
1088 /*
1089 Symptom <XX><XX>...</XX></XX>
1090 Action: merge the two XXs
1091
1092 For instance, this is useful after nested <dir>s used by Word
1093 for indenting have been converted to <div>s
1094
1095 If state is "no", no merging.
1096 If state is "yes", inner element is discarded. Only Style and Class
1097 attributes are merged using MergeStyles().
1098 If state is "auto", atttibutes are merged as described in CopyAttrs().
1099 Style and Class attributes are merged using MergeStyles().
1100 */
1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102 TidyTagId Id, TidyTriState state, Node *node,
1103 Node **ARG_UNUSED(pnode))
1104 {
1105 Node *child;
1106
1107 if ( state == TidyNoState
1108 || !TagIsId(node, Id) )
1109 return no;
1110
1111 child = node->content;
1112
1113 if ( child == NULL
1114 || child->next != NULL
1115 || !TagIsId(child, Id) )
1116 return no;
1117
1118 if ( state == TidyAutoState
1119 && CopyAttrs(doc, node, child) == no )
1120 return no;
1121
1122 MergeStyles( doc, node, child );
1123 StripOnlyChild( doc, node );
1124 return yes;
1125 }
1126
1127 /*
1128 Symptom: <ul><li><ul>...</ul></li></ul>
1129 Action: discard outer list
1130 */
1131
1132 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1133 {
1134 Node *child, *list;
1135
1136 if ( nodeIsUL(node) || nodeIsOL(node) )
1137 {
1138 child = node->content;
1139
1140 if (child == NULL)
1141 return no;
1142
1143 /* check child has no peers */
1144
1145 if (child->next)
1146 return no;
1147
1148 list = child->content;
1149
1150 if (!list)
1151 return no;
1152
1153 if (list->tag != node->tag)
1154 return no;
1155
1156 /* check list has no peers */
1157 if (list->next)
1158 return no;
1159
1160 *pnode = list; /* Set node to resume iteration */
1161
1162 /* move inner list node into position of outer node */
1163 list->prev = node->prev;
1164 list->next = node->next;
1165 list->parent = node->parent;
1166 FixNodeLinks(list);
1167
1168 /* get rid of outer ul and its li */
1169 child->content = NULL;
1170 FreeNode( doc, child ); /* See test #427841. */
1171 child = NULL;
1172 node->content = NULL;
1173 node->next = NULL;
1174 FreeNode( doc, node );
1175 node = NULL;
1176
1177 /*
1178 If prev node was a list the chances are this node
1179 should be appended to that list. Word has no way of
1180 recognizing nested lists and just uses indents
1181 */
1182
1183 if (list->prev)
1184 {
1185 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1186 && list->prev->last )
1187 {
1188 node = list;
1189 list = node->prev;
1190
1191 child = list->last; /* <li> */
1192
1193 list->next = node->next;
1194 FixNodeLinks(list);
1195
1196 node->parent = child;
1197 node->next = NULL;
1198 node->prev = child->last;
1199 FixNodeLinks(node);
1200 CleanNode( doc, node );
1201 }
1202 }
1203
1204 return yes;
1205 }
1206
1207 return no;
1208 }
1209
1210 /*
1211 Some necessary conditions to apply BlockStyle().
1212 */
1213
1214 static Bool CanApplyBlockStyle( Node *node )
1215 {
1216 if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1217 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218 {
1219 return yes;
1220 }
1221 return no;
1222 }
1223
1224 /*
1225 Symptom: the only child of a block-level element is a
1226 presentation element such as B, I or FONT
1227
1228 Action: add style "font-weight: bold" to the block and
1229 strip the <b> element, leaving its children.
1230
1231 example:
1232
1233 <p>
1234 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235 </p>
1236
1237 becomes:
1238
1239 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240 Draft Recommended Practice
1241 </p>
1242
1243 This code also replaces the align attribute by a style attribute.
1244 However, to avoid CSS problems with Navigator 4, this isn't done
1245 for the elements: caption, tr and table
1246 */
1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248 {
1249 Node *child;
1250
1251 if (CanApplyBlockStyle(node))
1252 {
1253 /* check for align attribute */
1254 if ( !nodeIsCAPTION(node) )
1255 TextAlign( doc, node );
1256
1257 child = node->content;
1258 if (child == NULL)
1259 return no;
1260
1261 /* check child has no peers */
1262 if (child->next)
1263 return no;
1264
1265 if ( nodeIsB(child) )
1266 {
1267 MergeStyles( doc, node, child );
1268 AddStyleProperty( doc, node, "font-weight: bold" );
1269 StripOnlyChild( doc, node );
1270 return yes;
1271 }
1272
1273 if ( nodeIsI(child) )
1274 {
1275 MergeStyles( doc, node, child );
1276 AddStyleProperty( doc, node, "font-style: italic" );
1277 StripOnlyChild( doc, node );
1278 return yes;
1279 }
1280
1281 if ( nodeIsFONT(child) )
1282 {
1283 MergeStyles( doc, node, child );
1284 AddFontStyles( doc, node, child->attributes );
1285 StripOnlyChild( doc, node );
1286 return yes;
1287 }
1288 }
1289
1290 return no;
1291 }
1292
1293 /* the only child of table cell or an inline element such as em */
1294 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1295 {
1296 Node *child;
1297
1298 if ( !nodeIsFONT(node) && nodeHasCM(node, CM_INLINE|CM_ROW) )
1299 {
1300 child = node->content;
1301
1302 if (child == NULL)
1303 return no;
1304
1305 /* check child has no peers */
1306
1307 if (child->next)
1308 return no;
1309
1310 if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) )
1311 {
1312 MergeStyles( doc, node, child );
1313 AddStyleProperty( doc, node, "font-weight: bold" );
1314 StripOnlyChild( doc, node );
1315 return yes;
1316 }
1317
1318 if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) )
1319 {
1320 MergeStyles( doc, node, child );
1321 AddStyleProperty( doc, node, "font-style: italic" );
1322 StripOnlyChild( doc, node );
1323 return yes;
1324 }
1325
1326 if ( nodeIsFONT(child) )
1327 {
1328 MergeStyles( doc, node, child );
1329 AddFontStyles( doc, node, child->attributes );
1330 StripOnlyChild( doc, node );
1331 return yes;
1332 }
1333 }
1334
1335 return no;
1336 }
1337
1338 /*
1339 Replace font elements by span elements, deleting
1340 the font element's attributes and replacing them
1341 by a single style attribute.
1342 */
1343 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1344 {
1345 AttVal *av, *style, *next;
1346
1347 if ( nodeIsFONT(node) )
1348 {
1349 if ( cfgBool(doc, TidyDropFontTags) )
1350 {
1351 DiscardContainer( doc, node, pnode );
1352 return yes;
1353 }
1354
1355 /* if FONT is only child of parent element then leave alone
1356 Do so only if BlockStyle may be succesful. */
1357 if ( node->parent->content == node && node->next == NULL &&
1358 CanApplyBlockStyle(node->parent) )
1359 return no;
1360
1361 AddFontStyles( doc, node, node->attributes );
1362
1363 /* extract style attribute and free the rest */
1364 av = node->attributes;
1365 style = NULL;
1366
1367 while (av)
1368 {
1369 next = av->next;
1370
1371 if (attrIsSTYLE(av))
1372 {
1373 av->next = NULL;
1374 style = av;
1375 }
1376 else
1377 {
1378 FreeAttribute( doc, av );
1379 }
1380 av = next;
1381 }
1382
1383 node->attributes = style;
1384 RenameElem( node, TidyTag_SPAN );
1385 return yes;
1386 }
1387
1388 return no;
1389 }
1390
1391 /*
1392 Applies all matching rules to a node.
1393 */
1394 Node* CleanNode( TidyDocImpl* doc, Node *node )
1395 {
1396 Node *next = NULL;
1397 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1398
1399 for (next = node; nodeIsElement(node); node = next)
1400 {
1401 if ( Dir2Div(doc, node, &next) )
1402 continue;
1403
1404 /* Special case: true result means
1405 ** that arg node and its parent no longer exist.
1406 ** So we must jump back up the CreateStyleProperties()
1407 ** call stack until we have a valid node reference.
1408 */
1409 if ( NestedList(doc, node, &next) )
1410 return next;
1411
1412 if ( Center2Div(doc, node, &next) )
1413 continue;
1414
1415 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1416 continue;
1417
1418 if ( BlockStyle(doc, node, &next) )
1419 continue;
1420
1421 if ( InlineStyle(doc, node, &next) )
1422 continue;
1423
1424 if ( Font2Span(doc, node, &next) )
1425 continue;
1426
1427 break;
1428 }