Version:
~ [ 1.0 ] ~
1 /*
2 pprint.c -- pretty print parse tree
3
4 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
6
7 CVS Info :
8
9 $Author: arnaud02 $
10 $Date: 2005/08/02 10:07:29 $
11 $Revision: 1.104 $
12
13 */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18
19 #include "pprint.h"
20 #include "tidy-int.h"
21 #include "parser.h"
22 #include "entities.h"
23 #include "tmbstr.h"
24 #include "utf8.h"
25
26 /*
27 Block-level and unknown elements are printed on
28 new lines and their contents indented 2 spaces
29
30 Inline elements are printed inline.
31
32 Inline content is wrapped on spaces (except in
33 attribute values or preformatted text, after
34 start tags and before end tags)
35 */
36
37 static void PPrintAsp( TidyDocImpl* doc, uint indent, Node* node );
38 static void PPrintJste( TidyDocImpl* doc, uint indent, Node* node );
39 static void PPrintPhp( TidyDocImpl* doc, uint indent, Node* node );
40 static int TextEndsWithNewline( Lexer *lexer, Node *node, uint mode );
41 static int TextStartsWithWhitespace( Lexer *lexer, Node *node, uint start, uint mode );
42 static Bool InsideHead( TidyDocImpl* doc, Node *node );
43 static Bool ShouldIndent( TidyDocImpl* doc, Node *node );
44
45 #if SUPPORT_ASIAN_ENCODINGS
46 /* #431953 - start RJ Wraplen adjusted for smooth international ride */
47
48 uint CWrapLen( TidyDocImpl* doc, uint ind )
49 {
50 ctmbstr lang = cfgStr( doc, TidyLanguage );
51 uint wraplen = cfg( doc, TidyWrapLen );
52
53 if ( !tmbstrcasecmp(lang, "zh") )
54 /* Chinese characters take two positions on a fixed-width screen */
55 /* It would be more accurate to keep a parallel linelen and wraphere
56 incremented by 2 for Chinese characters and 1 otherwise, but this
57 is way simpler.
58 */
59 return (ind + (( wraplen - ind ) / 2)) ;
60
61 if ( !tmbstrcasecmp(lang, "ja") )
62 /* average Japanese text is 30% kanji */
63 return (ind + ((( wraplen - ind ) * 7) / 10)) ;
64
65 return wraplen;
66 }
67
/* Unicode general categories that matter when choosing a wrap point. */
typedef enum
{
    UC00 = 0,   /* None */
    UCPC = 1,   /* Punctuation, Connector */
    UCPD = 2,   /* Punctuation, Dash */
    UCPE = 3,   /* Punctuation, Close */
    UCPS = 4,   /* Punctuation, Open */
    UCPI = 5,   /* Punctuation, Initial quote */
    UCPF = 6,   /* Punctuation, Final quote */
    UCPO = 7,   /* Punctuation, Other */
    UCZS = 8,   /* Separator, Space */
    UCZL = 9,   /* Separator, Line */
    UCZP = 10   /* Separator, Paragraph */
} UnicodeCategory;
82
83 /*
84 From the original code, the following characters are removed:
85
86 U+2011 (non-breaking hyphen)
87 U+202F (narrow non-break space)
88 U+2044 (fraction slash)
89 U+200B (zero width space)
90 ...... (bidi formatting control characters)
91
92 U+2011 and U+202F are non-breaking, U+2044 is a Sm character,
93 U+200B is a non-visible space, wrapping after it would make
94 this space visible, bidi should be done using HTML features
95 and the characters are neither Px or Zx.
96
97 The following Unicode 3.0 punctuation characters are added:
98
99 U+2048 (question exclamation mark)
100 U+2049 (exclamation question mark)
101 U+204A (tironian sign et)
102 U+204B (reversed pilcrow sign)
103 U+204C (black leftwards bullet)
104 U+204D (black rightwards bullet)
105 U+3030 (wavy dash)
106 U+30FB (katakana middle dot)
107 U+FE63 (small hyphen-minus)
108 U+FE68 (small reverse solidus)
109 U+FF3F (fullwidth low line)
110 U+FF5B (fullwidth left curly bracket)
111 U+FF5D (fullwidth right curly bracket)
112
113 Other additional characters were not included in Unicode 3.0.
114 The table is based on Unicode 4.0. It must include only those
115 characters marking a wrapping point, "before" if the general
116 category is UCPS or UCPI, otherwise "after".
117 */
/*
** Table mapping a code point to its general category.
** Entries are listed in ascending code point order; the lookup in
** CharacterWrapPoint stops scanning at the first entry whose code
** exceeds the character being looked up, so the order must be kept.
*/
118 static struct _unicode4cat
119 {
120 unsigned long code;
121 UnicodeCategory category;
122 } const unicode4cat[] =
123 {
/* The entries below U+2000 are compiled out. */
124 #if 0
125 { 0x037E, UCPO }, { 0x0387, UCPO }, { 0x055A, UCPO }, { 0x055B, UCPO },
126 { 0x055C, UCPO }, { 0x055D, UCPO }, { 0x055E, UCPO }, { 0x055F, UCPO },
127 { 0x0589, UCPO }, { 0x058A, UCPD }, { 0x05BE, UCPO }, { 0x05C0, UCPO },
128 { 0x05C3, UCPO }, { 0x05F3, UCPO }, { 0x05F4, UCPO }, { 0x060C, UCPO },
129 { 0x060D, UCPO }, { 0x061B, UCPO }, { 0x061F, UCPO }, { 0x066A, UCPO },
130 { 0x066B, UCPO }, { 0x066C, UCPO }, { 0x066D, UCPO }, { 0x06D4, UCPO },
131 { 0x0700, UCPO }, { 0x0701, UCPO }, { 0x0702, UCPO }, { 0x0703, UCPO },
132 { 0x0704, UCPO }, { 0x0705, UCPO }, { 0x0706, UCPO }, { 0x0707, UCPO },
133 { 0x0708, UCPO }, { 0x0709, UCPO }, { 0x070A, UCPO }, { 0x070B, UCPO },
134 { 0x070C, UCPO }, { 0x070D, UCPO }, { 0x0964, UCPO }, { 0x0965, UCPO },
135 { 0x0970, UCPO }, { 0x0DF4, UCPO }, { 0x0E4F, UCPO }, { 0x0E5A, UCPO },
136 { 0x0E5B, UCPO }, { 0x0F04, UCPO }, { 0x0F05, UCPO }, { 0x0F06, UCPO },
137 { 0x0F07, UCPO }, { 0x0F08, UCPO }, { 0x0F09, UCPO }, { 0x0F0A, UCPO },
138 { 0x0F0B, UCPO }, { 0x0F0D, UCPO }, { 0x0F0E, UCPO }, { 0x0F0F, UCPO },
139 { 0x0F10, UCPO }, { 0x0F11, UCPO }, { 0x0F12, UCPO }, { 0x0F3A, UCPS },
140 { 0x0F3B, UCPE }, { 0x0F3C, UCPS }, { 0x0F3D, UCPE }, { 0x0F85, UCPO },
141 { 0x104A, UCPO }, { 0x104B, UCPO }, { 0x104C, UCPO }, { 0x104D, UCPO },
142 { 0x104E, UCPO }, { 0x104F, UCPO }, { 0x10FB, UCPO }, { 0x1361, UCPO },
143 { 0x1362, UCPO }, { 0x1363, UCPO }, { 0x1364, UCPO }, { 0x1365, UCPO },
144 { 0x1366, UCPO }, { 0x1367, UCPO }, { 0x1368, UCPO }, { 0x166D, UCPO },
145 { 0x166E, UCPO }, { 0x1680, UCZS }, { 0x169B, UCPS }, { 0x169C, UCPE },
146 { 0x16EB, UCPO }, { 0x16EC, UCPO }, { 0x16ED, UCPO }, { 0x1735, UCPO },
147 { 0x1736, UCPO }, { 0x17D4, UCPO }, { 0x17D5, UCPO }, { 0x17D6, UCPO },
148 { 0x17D8, UCPO }, { 0x17D9, UCPO }, { 0x17DA, UCPO }, { 0x1800, UCPO },
149 { 0x1801, UCPO }, { 0x1802, UCPO }, { 0x1803, UCPO }, { 0x1804, UCPO },
150 { 0x1805, UCPO }, { 0x1806, UCPD }, { 0x1807, UCPO }, { 0x1808, UCPO },
151 { 0x1809, UCPO }, { 0x180A, UCPO }, { 0x180E, UCZS }, { 0x1944, UCPO },
152 { 0x1945, UCPO },
153 #endif
154 { 0x2000, UCZS }, { 0x2001, UCZS }, { 0x2002, UCZS }, { 0x2003, UCZS },
155 { 0x2004, UCZS }, { 0x2005, UCZS }, { 0x2006, UCZS }, { 0x2008, UCZS },
156 { 0x2009, UCZS }, { 0x200A, UCZS }, { 0x2010, UCPD }, { 0x2012, UCPD },
157 { 0x2013, UCPD }, { 0x2014, UCPD }, { 0x2015, UCPD }, { 0x2016, UCPO },
158 { 0x2017, UCPO }, { 0x2018, UCPI }, { 0x2019, UCPF }, { 0x201A, UCPS },
159 { 0x201B, UCPI }, { 0x201C, UCPI }, { 0x201D, UCPF }, { 0x201E, UCPS },
160 { 0x201F, UCPI }, { 0x2020, UCPO }, { 0x2021, UCPO }, { 0x2022, UCPO },
161 { 0x2023, UCPO }, { 0x2024, UCPO }, { 0x2025, UCPO }, { 0x2026, UCPO },
162 { 0x2027, UCPO }, { 0x2028, UCZL }, { 0x2029, UCZP }, { 0x2030, UCPO },
163 { 0x2031, UCPO }, { 0x2032, UCPO }, { 0x2033, UCPO }, { 0x2034, UCPO },
164 { 0x2035, UCPO }, { 0x2036, UCPO }, { 0x2037, UCPO }, { 0x2038, UCPO },
165 { 0x2039, UCPI }, { 0x203A, UCPF }, { 0x203B, UCPO }, { 0x203C, UCPO },
166 { 0x203D, UCPO }, { 0x203E, UCPO }, { 0x203F, UCPC }, { 0x2040, UCPC },
167 { 0x2041, UCPO }, { 0x2042, UCPO }, { 0x2043, UCPO }, { 0x2045, UCPS },
168 { 0x2046, UCPE }, { 0x2047, UCPO }, { 0x2048, UCPO }, { 0x2049, UCPO },
169 { 0x204A, UCPO }, { 0x204B, UCPO }, { 0x204C, UCPO }, { 0x204D, UCPO },
170 { 0x204E, UCPO }, { 0x204F, UCPO }, { 0x2050, UCPO }, { 0x2051, UCPO },
171 { 0x2053, UCPO }, { 0x2054, UCPC }, { 0x2057, UCPO }, { 0x205F, UCZS },
172 { 0x207D, UCPS }, { 0x207E, UCPE }, { 0x208D, UCPS }, { 0x208E, UCPE },
173 { 0x2329, UCPS }, { 0x232A, UCPE }, { 0x23B4, UCPS }, { 0x23B5, UCPE },
174 { 0x23B6, UCPO }, { 0x2768, UCPS }, { 0x2769, UCPE }, { 0x276A, UCPS },
175 { 0x276B, UCPE }, { 0x276C, UCPS }, { 0x276D, UCPE }, { 0x276E, UCPS },
176 { 0x276F, UCPE }, { 0x2770, UCPS }, { 0x2771, UCPE }, { 0x2772, UCPS },
177 { 0x2773, UCPE }, { 0x2774, UCPS }, { 0x2775, UCPE }, { 0x27E6, UCPS },
178 { 0x27E7, UCPE }, { 0x27E8, UCPS }, { 0x27E9, UCPE }, { 0x27EA, UCPS },
179 { 0x27EB, UCPE }, { 0x2983, UCPS }, { 0x2984, UCPE }, { 0x2985, UCPS },
180 { 0x2986, UCPE }, { 0x2987, UCPS }, { 0x2988, UCPE }, { 0x2989, UCPS },
181 { 0x298A, UCPE }, { 0x298B, UCPS }, { 0x298C, UCPE }, { 0x298D, UCPS },
182 { 0x298E, UCPE }, { 0x298F, UCPS }, { 0x2990, UCPE }, { 0x2991, UCPS },
183 { 0x2992, UCPE }, { 0x2993, UCPS }, { 0x2994, UCPE }, { 0x2995, UCPS },
184 { 0x2996, UCPE }, { 0x2997, UCPS }, { 0x2998, UCPE }, { 0x29D8, UCPS },
185 { 0x29D9, UCPE }, { 0x29DA, UCPS }, { 0x29DB, UCPE }, { 0x29FC, UCPS },
186 { 0x29FD, UCPE }, { 0x3001, UCPO }, { 0x3002, UCPO }, { 0x3003, UCPO },
187 { 0x3008, UCPS }, { 0x3009, UCPE }, { 0x300A, UCPS }, { 0x300B, UCPE },
188 { 0x300C, UCPS }, { 0x300D, UCPE }, { 0x300E, UCPS }, { 0x300F, UCPE },
189 { 0x3010, UCPS }, { 0x3011, UCPE }, { 0x3014, UCPS }, { 0x3015, UCPE },
190 { 0x3016, UCPS }, { 0x3017, UCPE }, { 0x3018, UCPS }, { 0x3019, UCPE },
191 { 0x301A, UCPS }, { 0x301B, UCPE }, { 0x301C, UCPD }, { 0x301D, UCPS },
192 { 0x301E, UCPE }, { 0x301F, UCPE }, { 0x3030, UCPD }, { 0x303D, UCPO },
193 { 0x30A0, UCPD }, { 0x30FB, UCPC }, { 0xFD3E, UCPS }, { 0xFD3F, UCPE },
194 { 0xFE30, UCPO }, { 0xFE31, UCPD }, { 0xFE32, UCPD }, { 0xFE33, UCPC },
195 { 0xFE34, UCPC }, { 0xFE35, UCPS }, { 0xFE36, UCPE }, { 0xFE37, UCPS },
196 { 0xFE38, UCPE }, { 0xFE39, UCPS }, { 0xFE3A, UCPE }, { 0xFE3B, UCPS },
197 { 0xFE3C, UCPE }, { 0xFE3D, UCPS }, { 0xFE3E, UCPE }, { 0xFE3F, UCPS },
198 { 0xFE40, UCPE }, { 0xFE41, UCPS }, { 0xFE42, UCPE }, { 0xFE43, UCPS },
199 { 0xFE44, UCPE }, { 0xFE45, UCPO }, { 0xFE46, UCPO }, { 0xFE47, UCPS },
200 { 0xFE48, UCPE }, { 0xFE49, UCPO }, { 0xFE4A, UCPO }, { 0xFE4B, UCPO },
201 { 0xFE4C, UCPO }, { 0xFE4D, UCPC }, { 0xFE4E, UCPC }, { 0xFE4F, UCPC },
202 { 0xFE50, UCPO }, { 0xFE51, UCPO }, { 0xFE52, UCPO }, { 0xFE54, UCPO },
203 { 0xFE55, UCPO }, { 0xFE56, UCPO }, { 0xFE57, UCPO }, { 0xFE58, UCPD },
204 { 0xFE59, UCPS }, { 0xFE5A, UCPE }, { 0xFE5B, UCPS }, { 0xFE5C, UCPE },
205 { 0xFE5D, UCPS }, { 0xFE5E, UCPE }, { 0xFE5F, UCPO }, { 0xFE60, UCPO },
206 { 0xFE61, UCPO }, { 0xFE63, UCPD }, { 0xFE68, UCPO }, { 0xFE6A, UCPO },
207 { 0xFE6B, UCPO }, { 0xFF01, UCPO }, { 0xFF02, UCPO }, { 0xFF03, UCPO },
208 { 0xFF05, UCPO }, { 0xFF06, UCPO }, { 0xFF07, UCPO }, { 0xFF08, UCPS },
209 { 0xFF09, UCPE }, { 0xFF0A, UCPO }, { 0xFF0C, UCPO }, { 0xFF0D, UCPD },
210 { 0xFF0E, UCPO }, { 0xFF0F, UCPO }, { 0xFF1A, UCPO }, { 0xFF1B, UCPO },
211 { 0xFF1F, UCPO }, { 0xFF20, UCPO }, { 0xFF3B, UCPS }, { 0xFF3C, UCPO },
212 { 0xFF3D, UCPE }, { 0xFF3F, UCPC }, { 0xFF5B, UCPS }, { 0xFF5D, UCPE },
213 { 0xFF5F, UCPS }, { 0xFF60, UCPE }, { 0xFF61, UCPO }, { 0xFF62, UCPS },
214 { 0xFF63, UCPE }, { 0xFF64, UCPO }, { 0xFF65, UCPC }, { 0x10100,UCPO },
215 { 0x10101,UCPO }, { 0x1039F,UCPO },
216
217 /* final entry */
/* A zero code terminates the scan in CharacterWrapPoint. */
218 { 0x0000, UC00 }
219 };
220
/* Where, relative to a character, a line break may be inserted. */
typedef enum
{
    NoWrapPoint = 0,   /* character has no effect on wrapping */
    WrapBefore  = 1,   /* break allowed in front of the character */
    WrapAfter   = 2    /* break allowed right after the character */
} WrapPoint;
227
228 /*
229 If long lines of text have no white space as defined in HTML 4
230 (U+0009, U+000A, U+000D, U+000C, U+0020) other characters could
231 be used to determine a wrap point. Since user agents would
232 normalize the inserted newline character to a space character,
233 this wrapping behaviour would insert visual whitespace into the
234 document.
235
236 Characters of the General Category Pi and Ps in the Unicode
237 character database (opening punctuation and initial quote
238 characters) mark a wrapping point before the character, other
239 punctuation characters (Pc, Pd, Pe, Pf, and Po), breakable
240 space characters (Zs), and paragraph and line separators
241 (Zl, Zp) mark a wrap point after the character. Using this
242 function Tidy can for example pretty print
243
244 <p>....................“...quote...”...</p>
245 as
246 <p>....................\n“...quote...”...</p>
247 or
248 <p>....................“...quote...”\n...</p>
249
250 if the next normal wrapping point would exceed the user
251 chosen wrapping column.
252 */
253 static WrapPoint CharacterWrapPoint(tchar c)
254 {
255 int i;
256 for (i = 0; unicode4cat[i].code && unicode4cat[i].code <= c; ++i)
257 if (unicode4cat[i].code == c)
258 /* wrapping before opening punctuation and initial quotes */
259 if (unicode4cat[i].category == UCPS ||
260 unicode4cat[i].category == UCPI)
261 return WrapBefore;
262 /* else wrapping after this character */
263 else
264 return WrapAfter;
265 /* character has no effect on line wrapping */
266 return NoWrapPoint;
267 }
268
269 static WrapPoint Big5WrapPoint(tchar c)
270 {
271 if ((c & 0xFF00) == 0xA100)
272 {
273 /* opening brackets have odd codes: break before them */
274 if ( c > 0x5C && c < 0xAD && (c & 1) == 1 )
275 return WrapBefore;
276 return WrapAfter;
277 }
278 return NoWrapPoint;
279 }
280
281 #endif /* SUPPORT_ASIAN_ENCODINGS */
282
283 static void InitIndent( TidyIndent* ind )
284 {
285 ind->spaces = -1;
286 ind->attrValStart = -1;
287 ind->attrStringStart = -1;
288 }
289
290 void InitPrintBuf( TidyDocImpl* doc )
291 {
292 ClearMemory( &doc->pprint, sizeof(TidyPrintImpl) );
293 InitIndent( &doc->pprint.indent[0] );
294 InitIndent( &doc->pprint.indent[1] );
295 }
296
297 void FreePrintBuf( TidyDocImpl* doc )
298 {
299 MemFree( doc->pprint.linebuf );
300 InitPrintBuf( doc );
301 }
302
303 static void expand( TidyPrintImpl* pprint, uint len )
304 {
305 uint* ip;
306 uint buflen = pprint->lbufsize;
307
308 if ( buflen == 0 )
309 buflen = 256;
310 while ( len >= buflen )
311 buflen *= 2;
312
313 ip = (uint*) MemRealloc( pprint->linebuf, buflen*sizeof(uint) );
314 if ( ip )
315 {
316 ClearMemory( ip+pprint->lbufsize,
317 (buflen-pprint->lbufsize)*sizeof(uint) );
318 pprint->lbufsize = buflen;
319 pprint->linebuf = ip;
320 }
321 }
322
323 static uint GetSpaces( TidyPrintImpl* pprint )
324 {
325 int spaces = pprint->indent[ 0 ].spaces;
326 return ( spaces < 0 ? 0U : (uint) spaces );
327 }
328 static int ClearInString( TidyPrintImpl* pprint )
329 {
330 TidyIndent *ind = pprint->indent + pprint->ixInd;
331 return ind->attrStringStart = -1;
332 }
333 static int ToggleInString( TidyPrintImpl* pprint )
334 {
335 TidyIndent *ind = pprint->indent + pprint->ixInd;
336 Bool inString = ( ind->attrStringStart >= 0 );
337 return ind->attrStringStart = ( inString ? -1 : (int) pprint->linelen );
338 }
339 static Bool IsInString( TidyPrintImpl* pprint )
340 {
341 TidyIndent *ind = pprint->indent + 0; /* Always 1st */
342 return ( ind->attrStringStart >= 0 &&
343 ind->attrStringStart < (int) pprint->linelen );
344 }
345 static Bool IsWrapInString( TidyPrintImpl* pprint )
346 {
347 TidyIndent *ind = pprint->indent + 0; /* Always 1st */
348 int wrap = (int) pprint->wraphere;
349 return ( ind->attrStringStart == 0 ||
350 (ind->attrStringStart > 0 && ind->attrStringStart < wrap) );
351 }
352
353 static Bool HasMixedContent (Node *element)
354 {
355 Node * node;
356
357 if (!element)
358 return no;
359
360 for (node = element->content; node; node = node->next)
361 if ( nodeIsText(node) )
362 return yes;
363
364 return no;
365 }
366
367 static void ClearInAttrVal( TidyPrintImpl* pprint )
368 {
369 TidyIndent *ind = pprint->indent + pprint->ixInd;
370 ind->attrValStart = -1;
371 }
372 static int SetInAttrVal( TidyPrintImpl* pprint )
373 {
374 TidyIndent *ind = pprint->indent + pprint->ixInd;
375 return ind->attrValStart = (int) pprint->linelen;
376 }
377 static Bool IsWrapInAttrVal( TidyPrintImpl* pprint )
378 {
379 TidyIndent *ind = pprint->indent + 0; /* Always 1st */
380 int wrap = (int) pprint->wraphere;
381 return ( ind->attrValStart == 0 ||
382 (ind->attrValStart > 0 && ind->attrValStart < wrap) );
383 }
384
385 static Bool WantIndent( TidyDocImpl* doc )
386 {
387 TidyPrintImpl* pprint = &doc->pprint;
388 Bool wantIt = GetSpaces(pprint) > 0;
389 if ( wantIt )
390 {
391 Bool indentAttrs = cfgBool( doc, TidyIndentAttributes );
392 wantIt = ( ( !IsWrapInAttrVal(pprint) || indentAttrs ) &&
393 !IsWrapInString(pprint) );
394 }
395 return wantIt;
396 }
397
398
399 static uint WrapOff( TidyDocImpl* doc )
400 {
401 uint saveWrap = cfg( doc, TidyWrapLen );
402 SetOptionInt( doc, TidyWrapLen, 0xFFFFFFFF ); /* very large number */
403 return saveWrap;
404 }
405
406 static void WrapOn( TidyDocImpl* doc, uint saveWrap )
407 {
408 SetOptionInt( doc, TidyWrapLen, saveWrap );
409 }
410
411 static uint WrapOffCond( TidyDocImpl* doc, Bool onoff )
412 {
413 if ( onoff )
414 return WrapOff( doc );
415 return cfg( doc, TidyWrapLen );
416 }
417
418
419 static void AddC( TidyPrintImpl* pprint, uint c, uint string_index)
420 {
421 if ( string_index + 1 >= pprint->lbufsize )
422 expand( pprint, string_index + 1 );
423 pprint->linebuf[string_index] = c;
424 }
425
426 static uint AddChar( TidyPrintImpl* pprint, uint c )
427 {
428 AddC( pprint, c, pprint->linelen );
429 return ++pprint->linelen;
430 }
431
432 static uint AddAsciiString( TidyPrintImpl* pprint, ctmbstr str, uint string_index )
433 {
434 uint ix, len = tmbstrlen( str );
435 if ( string_index + len >= pprint->lbufsize )
436 expand( pprint, string_index + len );
437
438 for ( ix=0; ix<len; ++ix )
439 pprint->linebuf[string_index + ix] = str[ ix ];
440 return string_index + len;
441 }
442
443 static uint AddString( TidyPrintImpl* pprint, ctmbstr str )
444 {
445 return pprint->linelen = AddAsciiString( pprint, str, pprint->linelen );
446 }
447
448 /* Saves current output point as the wrap point,
449 ** but only if indentation would NOT overflow
450 ** the current line. Otherwise keep previous wrap point.
451 */
452 static Bool SetWrap( TidyDocImpl* doc, uint indent )
453 {
454 TidyPrintImpl* pprint = &doc->pprint;
455 Bool wrap = ( indent + pprint->linelen < cfg(doc, TidyWrapLen) );
456 if ( wrap )
457 {
458 if ( pprint->indent[0].spaces < 0 )
459 pprint->indent[0].spaces = indent;
460 pprint->wraphere = pprint->linelen;
461 }
462 else if ( pprint->ixInd == 0 )
463 {
464 /* Save indent 1st time we pass the the wrap line */
465 pprint->indent[ 1 ].spaces = indent;
466 pprint->ixInd = 1;
467 }
468 return wrap;
469 }
470
471 static void CarryOver( int* valTo, int* valFrom, uint wrapPoint )
472 {
473 if ( *valFrom > (int) wrapPoint )
474 {
475 *valTo = *valFrom - wrapPoint;
476 *valFrom = -1;
477 }
478 }
479
480
481 static Bool SetWrapAttr( TidyDocImpl* doc,
482 uint indent, int attrStart, int strStart )
483 {
484 TidyPrintImpl* pprint = &doc->pprint;
485 TidyIndent *ind = pprint->indent + 0;
486
487 Bool wrap = ( indent + pprint->linelen < cfg(doc, TidyWrapLen) );
488 if ( wrap )
489 {
490 if ( ind[0].spaces < 0 )
491 ind[0].spaces = indent;
492 pprint->wraphere = pprint->linelen;
493 }
494 else if ( pprint->ixInd == 0 )
495 {
496 /* Save indent 1st time we pass the the wrap line */
497 pprint->indent[ 1 ].spaces = indent;
498 pprint->ixInd = 1;
499
500 /* Carry over string state */
501 CarryOver( &ind[1].attrStringStart, &ind[0].attrStringStart, pprint->wraphere );
502 CarryOver( &ind[1].attrValStart, &ind[0].attrValStart, pprint->wraphere );
503 }
504 ind += doc->pprint.ixInd;
505 ind->attrValStart = attrStart;
506 ind->attrStringStart = strStart;
507 return wrap;
508 }
509
510
511 /* Reset indent state after flushing a new line
512 */
513 static void ResetLine( TidyPrintImpl* pprint )
514 {
515 TidyIndent* ind = pprint->indent + 0;
516 if ( pprint->ixInd > 0 )
517 {
518 ind[0] = ind[1];
519 InitIndent( &ind[1] );
520 }
521
522 if ( pprint->wraphere > 0 )
523 {
524 int wrap = (int) pprint->wraphere;
525 if ( ind[0].attrStringStart > wrap )
526 ind[0].attrStringStart -= wrap;
527 if ( ind[0].attrValStart > wrap )
528 ind[0].attrValStart -= wrap;
529 }
530 else
531 {
532 if ( ind[0].attrStringStart > 0 )
533 ind[0].attrStringStart = 0;
534 if ( ind[0].attrValStart > 0 )
535 ind[0].attrValStart = 0;
536 }
537 pprint->wraphere = pprint->ixInd = 0;
538 }
539
540 /* Shift text after wrap point to
541 ** beginning of next line.
542 */
543 static void ResetLineAfterWrap( TidyPrintImpl* pprint )
544 {
545 if ( pprint->linelen > pprint->wraphere )
546 {
547 uint *p = pprint->linebuf;
548 uint *q = p + pprint->wraphere;
549 uint *end = p + pprint->linelen;
550
551 if ( ! IsWrapInAttrVal(pprint) )
552 {
553 while ( q < end && *q == ' ' )
554 ++q, ++pprint->wraphere;
555 }
556
557 while ( q < end )
558 *p++ = *q++;
559
560 pprint->linelen -= pprint->wraphere;
561 }
562 else
563 {
564 pprint->linelen = 0;
565 }
566
567 ResetLine( pprint );
568 }
569
570 /* Goes ahead with writing current line up to
571 ** previously saved wrap point. Shifts unwritten
572 ** text in output buffer to beginning of next line.
573 */
574 static void WrapLine( TidyDocImpl* doc )
575 {
576 TidyPrintImpl* pprint = &doc->pprint;
577 uint i;
578
579 if ( pprint->wraphere == 0 )
580 return;
581
582 if ( WantIndent(doc) )
583 {
584 uint spaces = GetSpaces( pprint );
585 for ( i = 0; i < spaces; ++i )
586 WriteChar( ' ', doc->docOut );
587 }
588
589 for ( i = 0; i < pprint->wraphere; ++i )
590 WriteChar( pprint->linebuf[i], doc->docOut );
591
592 if ( IsWrapInString(pprint) )
593 WriteChar( '\\', doc->docOut );
594
595 WriteChar( '\n', doc->docOut );
596 ResetLineAfterWrap( pprint );
597 }
598
599 /* Checks current output line length along with current indent.
600 ** If combined they overflow output line length, go ahead
601 ** and flush output up to the current wrap point.
602 */
603 static Bool CheckWrapLine( TidyDocImpl* doc )
604 {
605 TidyPrintImpl* pprint = &doc->pprint;
606 if ( GetSpaces(pprint) + pprint->linelen >= cfg(doc, TidyWrapLen) )
607 {
608 WrapLine( doc );
609 return yes;
610 }
611 return no;
612 }
613
614 static Bool CheckWrapIndent( TidyDocImpl* doc, uint indent )
615 {
616 TidyPrintImpl* pprint = &doc->pprint;
617 if ( GetSpaces(pprint) + pprint->linelen >= cfg(doc, TidyWrapLen) )
618 {
619 WrapLine( doc );
620 if ( pprint->indent[ 0 ].spaces < 0 )
621 pprint->indent[ 0 ].spaces = indent;
622 return yes;
623 }
624 return no;
625 }
626
627 static void WrapAttrVal( TidyDocImpl* doc )
628 {
629 TidyPrintImpl* pprint = &doc->pprint;
630 uint i;
631
632 /* assert( IsWrapInAttrVal(pprint) ); */
633 if ( WantIndent(doc) )
634 {
635 uint spaces = GetSpaces( pprint );
636 for ( i = 0; i < spaces; ++i )
637 WriteChar( ' ', doc->docOut );
638 }
639
640 for ( i = 0; i < pprint->wraphere; ++i )
641 WriteChar( pprint->linebuf[i], doc->docOut );
642
643 if ( IsWrapInString(pprint) )
644 WriteChar( '\\', doc->docOut );
645 else
646 WriteChar( ' ', doc->docOut );
647
648 WriteChar( '\n', doc->docOut );
649 ResetLineAfterWrap( pprint );
650 }
651
652 void PFlushLine( TidyDocImpl* doc, uint indent )
653 {
654 TidyPrintImpl* pprint = &doc->pprint;
655
656 if ( pprint->linelen > 0 )
657 {
658 uint i;
659
660 CheckWrapLine( doc );
661
662 if ( WantIndent(doc) )
663 {
664 uint spaces = GetSpaces( pprint );
665 for ( i = 0; i < spaces; ++i )
666 WriteChar( ' ', doc->docOut );
667 }
668
669 for ( i = 0; i < pprint->linelen; ++i )
670 WriteChar( pprint->linebuf[i], doc->docOut );
671
672 if ( IsInString(pprint) )
673 WriteChar( '\\', doc->docOut );
674 ResetLine( pprint );
675 pprint->linelen = 0;
676 }
677
678 WriteChar( '\n', doc->docOut );
679 pprint->indent[ 0 ].spaces = indent;
680 }
681
682 void PCondFlushLine( TidyDocImpl* doc, uint indent )
683 {
684 TidyPrintImpl* pprint = &doc->pprint;
685 if ( pprint->linelen > 0 )
686 {
687 uint i;
688
689 CheckWrapLine( doc );
690
691 if ( WantIndent(doc) )
692 {
693 uint spaces = GetSpaces( pprint );
694 for ( i = 0; i < spaces; ++i )
695 WriteChar(' ', doc->docOut);
696 }
697
698 for ( i = 0; i < pprint->linelen; ++i )
699 WriteChar( pprint->linebuf[i], doc->docOut );
700
701 if ( IsInString(pprint) )
702 WriteChar( '\\', doc->docOut );
703 ResetLine( pprint );
704
705 WriteChar( '\n', doc->docOut );
706 pprint->indent[ 0 ].spaces = indent;
707 pprint->linelen = 0;
708 }
709 }
710
711 static void PPrintChar( TidyDocImpl* doc, uint c, uint mode )
712 {
713 tmbchar entity[128];
714 ctmbstr p;
715 TidyPrintImpl* pprint = &doc->pprint;
716 uint outenc = cfg( doc, TidyOutCharEncoding );
717 Bool qmark = cfgBool( doc, TidyQuoteMarks );
718
719 if ( c == ' ' && !(mode & (PREFORMATTED | COMMENT | ATTRIBVALUE | CDATA)))
720 {
721 /* coerce a space character to a non-breaking space */
722 if (mode & NOWRAP)
723 {
724 ctmbstr ent = " ";
725 /* by default XML doesn't define */
726 if ( cfgBool(doc, TidyNumEntities) || cfgBool(doc, TidyXmlTags) )
727 ent = " ";
728 AddString( pprint, ent );
729 return;
730 }
731 else
732 pprint->wraphere = pprint->linelen;
733 }
734
735 /* comment characters are passed raw */
736 if ( mode & (COMMENT | CDATA) )
737 {
738 AddChar( pprint, c );
739 return;
740 }
741
742 /* except in CDATA map < to < etc. */
743 if ( !(mode & CDATA) )
744 {
745 if ( c == '<')
746 {
747 AddString( pprint, "<" );
748 return;
749 }
750
751 if ( c == '>')
752 {
753 AddString( pprint, ">" );
754 return;
755 }
756
757 /*
758 naked '&' chars can be left alone or
759 quoted as & The latter is required
760 for XML where naked '&' are illegal.
761 */
762 if ( c == '&' && cfgBool(doc, TidyQuoteAmpersand) )
763 {
764 AddString( pprint, "&" );
765 return;
766 }
767
768 if ( c == '"' && qmark )
769 {
770 AddString( pprint, """ );
771 return;
772 }
773
774 if ( c == '\'' && qmark )
775 {
776 AddString( pprint, "'" );
777 return;
778 }
779
780 if ( c == 160 && outenc != RAW )
781 {
782 if ( cfgBool(doc, TidyQuoteNbsp) )
783 {
784 if ( cfgBool(doc, TidyNumEntities) ||
785 cfgBool(doc, TidyXmlTags) )
786 AddString( pprint, " " );
787 else
788 AddString( pprint, " " );
789 }
790 else
791 AddChar( pprint, c );
792 return;
793 }
794 }
795
796 #if SUPPORT_ASIAN_ENCODINGS
797
798 /* #431953 - start RJ */
799 /* Handle encoding-specific issues */
800 switch ( outenc )
801 {
802 case UTF8:
803 #if SUPPORT_UTF16_ENCODINGS
804 case UTF16:
805 case UTF16LE:
806 case UTF16BE:
807 #endif
808 if (!(mode & PREFORMATTED) && cfg(doc, TidyPunctWrap))
809 {
810 WrapPoint wp = CharacterWrapPoint(c);
811 if (wp == WrapBefore)
812 pprint->wraphere = pprint->linelen;
813 else if (wp == WrapAfter)
814 pprint->wraphere = pprint->linelen + 1;
815 }
816 break;
817
818 case BIG5:
819 /* Allow linebreak at Chinese punctuation characters */
820 /* There are not many spaces in Chinese */
821 AddChar( pprint, c );
822 if (!(mode & PREFORMATTED) && cfg(doc, TidyPunctWrap))
823 {
824 WrapPoint wp = Big5WrapPoint(c);
825 if (wp == WrapBefore)
826 pprint->wraphere = pprint->linelen;
827 else if (wp == WrapAfter)
828 pprint->wraphere = pprint->linelen + 1;
829 }
830 return;
831
832 case SHIFTJIS:
833 #ifndef NO_NATIVE_ISO2022_SUPPORT
834 case ISO2022: /* ISO 2022 characters are passed raw */
835 #endif
836 case RAW:
837 AddChar( pprint, c );
838 return;
839 }
840 /* #431953 - end RJ */
841
842 #else /* SUPPORT_ASIAN_ENCODINGS */
843
844 /* otherwise ISO 2022 characters are passed raw */
845 if (
846 #ifndef NO_NATIVE_ISO2022_SUPPORT
847 outenc == ISO2022 ||
848 #endif
849 outenc == RAW )
850 {
851 AddChar( pprint, c );
852 return;
853 }
854
855 #endif /* SUPPORT_ASIAN_ENCODINGS */
856
857 /* don't map latin-1 chars to entities */
858 if ( outenc == LATIN1 )
859 {
860 if (c > 255) /* multi byte chars */
861 {
862 uint vers = HTMLVersion( doc );
863 if ( !cfgBool(doc, TidyNumEntities) && (p = EntityName(c, vers)) )
864 tmbsnprintf(entity, sizeof(entity), "&%s;", p);
865 else
866 tmbsnprintf(entity, sizeof(entity), "&#%u;", c);
867
868 AddString( pprint, entity );
869 return;
870 }
871
872 if (c > 126 && c < 160)
873 {
874 tmbsnprintf(entity, sizeof(entity), "&#%u;", c);
875 AddString( pprint, entity );
876 return;
877 }
878
879 AddChar( pprint, c );
880 return;
881 }
882
883 /* don't map UTF-8 chars to entities */
884 if ( outenc == UTF8 )
885 {
886 AddChar( pprint, c );
887 return;
888 }
889
890 #if SUPPORT_UTF16_ENCODINGS
891 /* don't map UTF-16 chars to entities */
892 if ( outenc == UTF16 || outenc == UTF16LE || outenc == UTF16BE )
893 {
894 AddChar( pprint, c );
895 return;
896 }
897 #endif
898
899 /* use numeric entities only for XML */
900 if ( cfgBool(doc, TidyXmlTags) )
901 {
902 /* if ASCII use numeric entities for chars > 127 */
903 if ( c > 127 && outenc == ASCII )
904 {
905 tmbsnprintf(entity, sizeof(entity), "&#%u;", c);
906 AddString( pprint, entity );
907 return;
908 }
909
910 /* otherwise output char raw */
911 AddChar( pprint, c );
912 return;
913 }
914
915 /* default treatment for ASCII */
916 if ( outenc == ASCII && (c > 126 || (c < ' ' && c != '\t')) )
917 {
918 uint vers = HTMLVersion( doc );
919 if (!cfgBool(doc, TidyNumEntities) && (p = EntityName(c, vers)) )
920 tmbsnprintf(entity, sizeof(entity), "&%s;", p);
921 else
922 tmbsnprintf(entity, sizeof(entity), "&#%u;", c);
923
924 AddString( pprint, entity );
925 return;
926 }
927
928 AddChar( pprint, c );
929 }
930
931 static uint IncrWS( uint start, uint end, uint indent, int ixWS )
932 {
933 if ( ixWS > 0 )
934 {
935 uint st = start + MIN( (uint)ixWS, indent );
936 start = MIN( st, end );
937 }
938 return start;
939 }
940 /*
941 The line buffer is uint not char so we can
942 hold Unicode values unencoded. The translation
943 to UTF-8 is deferred to the WriteChar() routine called
944 to flush the line buffer.
945 */
946 static void PPrintText( TidyDocImpl* doc, uint mode, uint indent,
947 Node* node )
948 {
949 uint start = node->start;
950 uint end = node->end;
951 uint ix, c = 0;
952 int ixNL = TextEndsWithNewline( doc->lexer, node, mode );
953 int ixWS = TextStartsWithWhitespace( doc->lexer, node, start, mode );
954 if ( ixNL > 0 )
955 end -= ixNL;
956 start = IncrWS( start, end, indent, ixWS );
957
958 for ( ix = start; ix < end; ++ix )
959 {
960 CheckWrapIndent( doc, indent );
961 /*
962 if ( CheckWrapIndent(doc, indent) )
963 {
964 ixWS = TextStartsWithWhitespace( doc->lexer, node, ix );
965 ix = IncrWS( ix, end, indent, ixWS );
966 }
967 */
968 c = (byte) doc->lexer->lexbuf[ix];
969
970 /* look for UTF-8 multibyte character */
971 if ( c > 0x7F )
972 ix += GetUTF8( doc->lexer->lexbuf + ix, &c );
973
974 if ( c == '\n' )
975 {
976 PFlushLine( doc, indent );
977 ixWS = TextStartsWithWhitespace( doc->lexer, node, ix+1, mode );
978 ix = IncrWS( ix, end, indent, ixWS );
979 }
980 else
981 {
982 PPrintChar( doc, c, mode );
983 }
984 }
985 }
986
#if 0
/* Compiled out: appends every character of str to the line buffer. */
static void PPrintString( TidyDocImpl* doc, uint indent, ctmbstr str )
{
    for ( ; *str != '\0'; ++str )
        AddChar( &doc->pprint, *str );
}
#endif /* 0 */
994
995
996 static void PPrintAttrValue( TidyDocImpl* doc, uint indent,
997 ctmbstr value, uint delim, Bool wrappable, Bool scriptAttr )
998 {
999 TidyPrintImpl* pprint = &doc->pprint;
1000 Bool scriptlets = cfgBool(doc, TidyWrapScriptlets);
1001
1002 int mode = PREFORMATTED | ATTRIBVALUE;
1003 if ( wrappable )
1004 mode = NORMAL | ATTRIBVALUE;
1005
1006 /* look for ASP, Tango or PHP instructions for computed attribute value */
1007 if ( value && value[0] == '<' )
1008 {
1009 if ( value[1] == '%' || value[1] == '@'||
1010 tmbstrncmp(value, "<?php", 5) == 0 )
1011 mode |= CDATA;
1012 }
1013
1014 if ( delim == 0 )
1015 delim = '"';
1016
1017 AddChar( pprint, '=' );
1018
1019 /* don't wrap after "=" for xml documents */
1020 if ( !cfgBool(doc, TidyXmlOut) || cfgBool(doc, TidyXhtmlOut) )
1021 {
1022 SetWrap( doc, indent );
1023 CheckWrapIndent( doc, indent );
1024 /*
1025 if ( !SetWrap(doc, indent) )
1026 PCondFlushLine( doc, indent );
1027 */
1028 }
1029
1030 AddChar( pprint, delim );
1031
1032 if ( value )
1033 {
1034 uint wraplen = cfg( doc, TidyWrapLen );
1035 int attrStart = SetInAttrVal( pprint );
1036 int strStart = ClearInString( pprint );
1037
1038 while (*value != '\0')
1039 {
1040 uint c = *value;
1041
1042 if ( wrappable && c == ' ' )
1043 SetWrapAttr( doc, indent, attrStart, strStart );
1044
1045 if ( wrappable && pprint->wraphere > 0 &&
1046 GetSpaces(pprint) + pprint->linelen >= wraplen )
1047 WrapAttrVal( doc );
1048
1049 if ( c == delim )
1050 {
1051 ctmbstr entity = (c == '"' ? """ : "'");
1052 AddString( pprint, entity );
1053 ++value;
1054 continue;
1055 }
1056 else if (c == '"')
1057 {
1058 if ( cfgBool(doc, TidyQuoteMarks) )
1059 AddString( pprint, """ );
1060 else
1061 AddChar( pprint, c );
1062
1063 if ( delim == '\'' && scriptAttr && scriptlets )
1064 strStart = ToggleInString( pprint );
1065
1066 ++value;
1067 continue;
1068 }
1069 else if ( c == '\'' )
1070 {
1071 if ( cfgBool(doc, TidyQuoteMarks) )
1072 AddString( pprint, "'" );
1073 else
1074 AddChar( pprint, c );
1075
1076 if ( delim == '"' && scriptAttr && scriptlets )
1077 strStart = ToggleInString( pprint );
1078
1079 ++value;
1080 continue;
1081 }
1082
1083 /* look for UTF-8 multibyte character */
1084 if ( c > 0x7F )
1085 value += GetUTF8( value, &c );
1086 ++value;
1087
1088 if ( c == '\n' )
1089 {
1090 /* No indent inside Javascript literals */
1091 PFlushLine( doc, (strStart < 0 ? indent : 0) );
1092 continue;
1093 }
1094 PPrintChar( doc, c, mode );
1095 }
1096 ClearInAttrVal( pprint );
1097 ClearInString( pprint );
1098 }
1099 AddChar( pprint, delim );
1100 }
1101
1102 static uint AttrIndent( TidyDocImpl* doc, Node* node, AttVal* ARG_UNUSED(attr) )
1103 {
1104 uint spaces = cfg( doc, TidyIndentSpaces );
1105 uint xtra = 2; /* 1 for the '<', another for the ' ' */
1106 if ( node->element == NULL )
1107 return spaces;
1108
1109 if ( !nodeHasCM(node, CM_INLINE) ||
1110 !ShouldIndent(doc, node->parent ? node->parent: node) )
1111 return xtra + tmbstrlen( node->element );
1112
1113 if ( NULL != (node = FindContainer(node)) )
1114 return xtra + tmbstrlen( node->element );
1115 return spaces;
1116 }
1117
1118 static Bool AttrNoIndentFirst( /*TidyDocImpl* doc,*/ Node* node, AttVal* attr )
1119 {
1120 return ( attr==node->attributes );
1121
1122 /*&&
1123 ( InsideHead(doc, node) ||
1124 !nodeHasCM(node, CM_INLINE) ) );
1125 */
1126 }
1127
1128 static void PPrintAttribute( TidyDocImpl* doc, uint indent,
1129 Node *node, AttVal *attr )
1130 {
1131 TidyPrintImpl* pprint = &doc->pprint;
1132 Bool xmlOut = cfgBool( doc, TidyXmlOut );
1133 Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
1134 Bool wrapAttrs = cfgBool( doc, TidyWrapAttVals );
1135 Bool ucAttrs = cfgBool( doc, TidyUpperCaseAttrs );
1136 Bool indAttrs = cfgBool( doc, TidyIndentAttributes );
1137 uint xtra = AttrIndent( doc, node, attr );
1138 Bool first = AttrNoIndentFirst( /*doc,*/ node, attr );
1139 tmbstr name = attr->attribute;
1140 Bool wrappable = no;
1141 tchar c;
1142
1143 /* fix for odd attribute indentation bug triggered by long values */
1144 if (!indAttrs)
1145 xtra = 0;
1146
1147 if ( indAttrs )
1148 {
1149 if ( nodeIsElement(node) && !first )
1150 {
1151 indent += xtra;
1152 PCondFlushLine( doc, indent );
1153 }
1154 else
1155 indAttrs = no;
1156 }
1157
1158 CheckWrapIndent( doc, indent );
1159
1160 if ( !xmlOut && !xhtmlOut && attr->dict )
1161 {
1162 if ( IsScript(doc, name) )
1163 wrappable = cfgBool( doc, TidyWrapScriptlets );
1164 else if (!(attrIsCONTENT(attr) || attrIsVALUE(attr) || attrIsALT(attr)) && wrapAttrs )
1165 wrappable = yes;
1166 }
1167
1168 if ( !first && !SetWrap(doc, indent) )
1169 {
1170 PFlushLine( doc, indent+xtra ); /* Put it on next line */
1171 }
1172 else if ( pprint->linelen > 0 )
1173 {
1174 AddChar( pprint, ' ' );
1175 }
1176
1177 /* Attribute name */
1178 while (*name)
1179 {
1180 c = (unsigned char)*name;
1181
1182 if (c > 0x7F)
1183 name += GetUTF8(name, &c);
1184 else if (ucAttrs)
1185 c = ToUpper(c);
1186
1187 AddChar(pprint, c);
1188 ++name;
1189 }
1190
1191 /* fix for bug 732038 */
1192 #if 0
1193 /* If not indenting attributes, bump up indent for
1194 ** value after putting out name.
1195 */
1196 if ( !indAttrs )
1197 indent += xtra;
1198 #endif
1199
1200 CheckWrapIndent( doc, indent );
1201
1202 if ( attr->value == NULL )
1203 {
1204 Bool isB = IsBoolAttribute(attr);
1205 Bool scriptAttr = attrIsEvent(attr);
1206
1207 if ( xmlOut )
1208 PPrintAttrValue( doc, indent, isB ? attr->attribute : NULLSTR,
1209 attr->delim, no, scriptAttr );
1210
1211 else if ( !isB && !IsNewNode(node) )
1212 PPrintAttrValue( doc, indent, "", attr->delim, yes, scriptAttr );
1213
1214 else
1215 SetWrap( doc, indent );
1216 }
1217 else
1218 PPrintAttrValue( doc, indent, attr->value, attr->delim, wrappable, no );
1219 }
1220
1221 static void