gloox  1.0
parser.cpp
1 /*
2  Copyright (c) 2004-2009 by Jakob Schroeter <js@camaya.net>
3  This file is part of the gloox library. http://camaya.net/gloox
4 
5  This software is distributed under a license. The full license
6  agreement can be found in the file LICENSE in this distribution.
7  This software may not be copied, modified, sold or distributed
8  other than expressed in the named license agreement.
9 
10  This software is distributed without any warranty.
11 */
12 
13 #include "gloox.h"
14 #include "util.h"
15 #include "parser.h"
16 
17 #include <cstdlib>
18 
19 namespace gloox
20 {
21 
22  Parser::Parser( TagHandler* ph, bool deleteRoot )
23  : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24  m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
25  m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
26  {
27  }
28 
30  {
31  delete m_root;
32  delete m_xmlnss;
33  }
34 
35  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
36  {
37  std::string::size_type p = data.find( ';', pos );
38  std::string::size_type diff = p - pos;
39 
40  if( p == std::string::npos )
41  {
42  m_backBuffer = data.substr( pos );
43  return DecodeInsufficient;
44  }
45 
46  if( diff < 3 || diff > 9 )
47  return DecodeInvalid;
48 
49  std::string rep;
50  switch( data[pos + 1] )
51  {
52  case '#':
53  {
54  int base = 10;
55  int idx = 2;
56 
57  if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
58  {
59  base = 16;
60  idx = 3;
61  }
62 
63  char* end;
64  const long int val = std::strtol( data.data() + pos + idx, &end, base );
65  if( *end != ';' || val < 0 )
66  return DecodeInvalid;
67 
68  if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
69  {
70  rep += char( val );
71  }
72  else if( val >= 0x80 && val <= 0x7FF )
73  {
74  rep += char( 192 + ( val >> 6 ) );
75  rep += char( 128 + ( val % 64 ) );
76  }
77  else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
78  {
79  rep += char( 224 + ( val >> 12 ) );
80  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
81  rep += char( 128 + ( val % 64 ) );
82  }
83  else if( val >= 0x100000 && val < 0x10FFFF )
84  {
85  rep += char( 240 + ( val >> 18 ) );
86  rep += char( 128 + ( ( val >> 12 ) % 64 ) );
87  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
88  rep += char( 128 + ( val % 64 ) );
89  }
90  else
91  return DecodeInvalid;
92  }
93  break;
94  case 'l':
95  if( diff == 3 && data[pos + 2] == 't' )
96  rep += '<';
97  else
98  return DecodeInvalid;
99  break;
100  case 'g':
101  if( diff == 3 && data[pos + 2] == 't' )
102  rep += '>';
103  else
104  return DecodeInvalid;
105  break;
106  case 'a':
107  if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
108  rep += '\'';
109  else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
110  rep += '&';
111  else
112  return DecodeInvalid;
113  break;
114  case 'q':
115  if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
116  rep += '"';
117  else
118  return DecodeInvalid;
119  break;
120  default:
121  return DecodeInvalid;
122  }
123 
124  switch( m_state )
125  {
126  case TagInside:
127  m_cdata += rep;
128  break;
129  case TagAttributeValue:
130  m_value += rep;
131  break;
132  default:
133  break;
134  }
135  pos += diff;
136  return DecodeValid;
137  }
138 
139  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
140  const std::string& needle )
141  {
142  if( pos + needle.length() <= data.length() )
143  {
144  if( !data.compare( pos, needle.length(), needle ) )
145  {
146  pos += needle.length() - 1;
147  return ForwardFound;
148  }
149  else
150  {
151  return ForwardNotFound;
152  }
153  }
154  else
155  {
156  m_backBuffer = data.substr( pos );
157  return ForwardInsufficientSize;
158  }
159  }
160 
161  int Parser::feed( std::string& data )
162  {
163  if( !m_backBuffer.empty() )
164  {
165  data.insert( 0, m_backBuffer );
166  m_backBuffer = EmptyString;
167  }
168 
169  std::string::size_type count = data.length();
170  for( std::string::size_type i = 0; i < count; ++i )
171  {
172  const unsigned char c = data[i];
173 // printf( "found char: %c, ", c );
174 
175  if( !isValid( c ) )
176  {
177  cleanup();
178  return static_cast<int>( i );
179  }
180 
181  switch( m_state )
182  {
183  case Initial:
184 // printf( "Initial: %c\n", c );
185  if( isWhitespace( c ) )
186  break;
187 
188  switch( c )
189  {
190  case '<':
191  m_state = TagOpening;
192  break;
193  default:
194  cleanup();
195  return static_cast<int>( i );
196  break;
197  }
198  break;
199  case InterTag:
200 // printf( "InterTag: %c\n", c );
201  m_tag = EmptyString;
202  if( isWhitespace( c ) )
203  break;
204 
205  switch( c )
206  {
207  case '<':
208  m_state = TagOpening;
209  break;
210  case '>':
211  default:
212  if( m_current )
213  {
214  m_cdata += c;
215  m_state = TagInside;
216  }
217  break;
218  }
219  break;
220  case TagOpening: // opening '<' has been found before
221 // printf( "TagOpening: %c\n", c );
222  if( isWhitespace( c ) )
223  break;
224 
225  switch( c )
226  {
227  case '<':
228  case '>':
229  case '&':
230  cleanup();
231  return static_cast<int>( i );
232  break;
233  case '/':
234  m_state = TagClosingSlash;
235  break;
236  case '?':
237  m_state = TagNameCollect;
238  m_preamble = 1;
239  break;
240  case '!':
241  switch( forwardScan( i, data, "![CDATA[" ) )
242  {
243  case ForwardFound:
244  m_state = TagCDATASection;
245  break;
246  case ForwardNotFound:
247  cleanup();
248  return static_cast<int>( i );
249  case ForwardInsufficientSize:
250  return -1;
251  }
252  break;
253  default:
254  m_tag += c;
255  m_state = TagNameCollect;
256  break;
257  }
258  break;
259  case TagCDATASection:
260  switch( c )
261  {
262  case ']':
263  switch( forwardScan( i, data, "]]>" ) )
264  {
265  case ForwardFound:
266  m_state = TagInside;
267  break;
268  case ForwardNotFound:
269  m_cdata += c;
270  break;
271  case ForwardInsufficientSize:
272  return -1;
273  }
274  break;
275  default:
276  m_cdata += c;
277  break;
278  }
279  break;
280  case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
281 // printf( "TagNameCollect: %c\n", c );
282  if( isWhitespace( c ) )
283  {
284  m_state = TagNameComplete;
285  break;
286  }
287 
288  switch( c )
289  {
290  case '<':
291  case '?':
292  case '!':
293  case '&':
294  cleanup();
295  return static_cast<int>( i );
296  break;
297  case '/':
298  m_state = TagOpeningSlash;
299  break;
300  case '>':
301  addTag();
302  m_state = TagInside;
303  break;
304  case ':':
305  if( !m_haveTagPrefix )
306  {
307  m_haveTagPrefix = true;
308  m_tagPrefix = m_tag;
309  m_tag = EmptyString;
310  }
311  else
312  {
313  cleanup();
314  return static_cast<int>( i );
315  }
316  break;
317  default:
318  m_tag += c;
319  break;
320  }
321  break;
322  case TagInside: // we're inside a tag, expecting a child tag or cdata
323 // printf( "TagInside: %c\n", c );
324  m_tag = EmptyString;
325  switch( c )
326  {
327  case '<':
328  addCData();
329  m_state = TagOpening;
330  break;
331  case '&':
332 // printf( "TagInside, calling decode\n" );
333  switch( decode( i, data ) )
334  {
335  case DecodeValid:
336  break;
337  case DecodeInvalid:
338  cleanup();
339  return static_cast<int>( i );
340  case DecodeInsufficient:
341  return -1;
342  }
343  break;
344  default:
345  m_cdata += c;
346  break;
347  }
348  break;
349  case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
350 // printf( "TagOpeningSlash: %c\n", c );
351  if( isWhitespace( c ) )
352  break;
353 
354  if( c == '>' )
355  {
356  addTag();
357  if( !closeTag() )
358  {
359 // printf( "noipe, here\n" );
360  cleanup();
361  return static_cast<int>( i );
362  }
363 
364  m_state = InterTag;
365  }
366  else
367  {
368  cleanup();
369  return static_cast<int>( i );
370  }
371  break;
372  case TagClosingSlash: // we have found the '/' of a closing tag
373 // printf( "TagClosingSlash: %c\n", c );
374  if( isWhitespace( c ) )
375  break;
376 
377  switch( c )
378  {
379  case '>':
380  case '<':
381  case '/':
382  cleanup();
383  return static_cast<int>( i );
384  break;
385  default:
386  m_tag += c;
387  m_state = TagClosing;
388  break;
389  }
390  break;
391  case TagClosing: // we're collecting the name of a closing tag
392 // printf( "TagClosing: %c\n", c );
393  switch( c )
394  {
395  case '<':
396  case '/':
397  case '!':
398  case '?':
399  case '&':
400  cleanup();
401  return static_cast<int>( i );
402  break;
403  case ':':
404  if( !m_haveTagPrefix )
405  {
406  m_haveTagPrefix = true;
407  m_tagPrefix = m_tag;
408  m_tag = EmptyString;
409  }
410  else
411  {
412  cleanup();
413  return static_cast<int>( i );
414  }
415  break;
416  case '>':
417  if( !closeTag() )
418  {
419 // printf( "here\n" );
420  cleanup();
421  return static_cast<int>( i );
422  }
423  m_state = InterTag;
424  break;
425  default:
426  m_tag += c;
427  break;
428  }
429  break;
430  case TagNameComplete: // a tag name is complete, expect tag close or attribs
431 // printf( "TagNameComplete: %c\n", c );
432  if( isWhitespace( c ) )
433  break;
434 
435  switch( c )
436  {
437  case '<':
438  case '!':
439  case '&':
440  cleanup();
441  return static_cast<int>( i );
442  break;
443  case '/':
444  m_state = TagOpeningSlash;
445  break;
446  case '>':
447  if( m_preamble == 1 )
448  {
449  cleanup();
450  return static_cast<int>( i );
451  }
452  m_state = TagInside;
453  addTag();
454  break;
455  case '?':
456  if( m_preamble == 1 )
457  m_preamble = 2;
458  else
459  {
460  cleanup();
461  return static_cast<int>( i );
462  }
463  break;
464  default:
465  m_attrib += c;
466  m_state = TagAttribute;
467  break;
468  }
469  break;
470  case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
471 // printf( "TagAttribute: %c\n", c );
472  if( isWhitespace( c ) )
473  {
474  m_state = TagAttributeComplete;
475  break;
476  }
477 
478  switch( c )
479  {
480  case '<':
481  case '/':
482  case '>':
483  case '?':
484  case '!':
485  case '&':
486  cleanup();
487  return static_cast<int>( i );
488  break;
489  case '=':
490  m_state = TagAttributeEqual;
491  break;
492  case ':':
493  if( !m_haveAttribPrefix && m_attrib != XMLNS )
494  {
495  m_haveAttribPrefix = true;
496  m_attribPrefix = m_attrib;
497  m_attrib = EmptyString;
498  }
499  else if( m_attrib == XMLNS )
500  {
501  m_attribIsXmlns = true;
502  m_attrib = EmptyString;
503  }
504  else
505  {
506  cleanup();
507  return static_cast<int>( i );
508  }
509  break;
510  default:
511  m_attrib += c;
512  }
513  break;
514  case TagAttributeComplete: // we're expecting an equals sign or ws
515 // printf( "TagAttributeComplete: %c\n", c );
516  if( isWhitespace( c ) )
517  break;
518 
519  switch( c )
520  {
521  case '=':
522  m_state = TagAttributeEqual;
523  break;
524  default:
525  cleanup();
526  return static_cast<int>( i );
527  break;
528  }
529  break;
530  case TagAttributeEqual: // we have found an equals sign
531 // printf( "TagAttributeEqual: %c\n", c );
532  if( isWhitespace( c ) )
533  break;
534 
535  switch( c )
536  {
537  case '"':
538  m_quote = true;
539  case '\'':
540  m_state = TagAttributeValue;
541  break;
542  default:
543  cleanup();
544  return static_cast<int>( i );
545  break;
546  }
547  break;
548  case TagAttributeValue: // we're expecting value data
549 // printf( "TagValue: %c\n", c );
550  switch( c )
551  {
552  case '<':
553  cleanup();
554  return static_cast<int>( i );
555  break;
556  case '\'':
557  if( m_quote )
558  {
559  m_value += c;
560  break;
561  }
562  case '"':
563  addAttribute();
564  m_state = TagNameAlmostComplete;
565  m_quote = false;
566  break;
567  case '&':
568 // printf( "TagAttributeValue, calling decode\n" );
569  switch( decode( i, data ) )
570  {
571  case DecodeValid:
572  break;
573  case DecodeInvalid:
574  cleanup();
575  return static_cast<int>( i );
576  case DecodeInsufficient:
577  return -1;
578  }
579  break;
580  case '>':
581  default:
582  m_value += c;
583  }
584  break;
585  case TagNameAlmostComplete:
586 // printf( "TagAttributeEqual: %c\n", c );
587  if( isWhitespace( c ) )
588  {
589  m_state = TagNameComplete;
590  break;
591  }
592 
593  switch( c )
594  {
595  case '/':
596  m_state = TagOpeningSlash;
597  break;
598  case '>':
599  if( m_preamble == 1 )
600  {
601  cleanup();
602  return static_cast<int>( i );
603  }
604  m_state = TagInside;
605  addTag();
606  break;
607  case '?':
608  if( m_preamble == 1 )
609  m_preamble = 2;
610  else
611  {
612  cleanup();
613  return static_cast<int>( i );
614  }
615  break;
616  default:
617  cleanup();
618  return static_cast<int>( i );
619  break;
620  }
621  break;
622  default:
623 // printf( "default action!?\n" );
624  break;
625  }
626 // printf( "parser state: %d\n", m_state );
627  }
628 
629  return -1;
630  }
631 
632  void Parser::addTag()
633  {
634  if( !m_root )
635  {
636 // printf( "created Tag named %s, ", m_tag.c_str() );
637  m_root = new Tag( m_tag );
638  m_current = m_root;
639  }
640  else
641  {
642 // printf( "created Tag named %s, ", m_tag.c_str() );
643  m_current = new Tag( m_current, m_tag );
644  }
645 
646  if( m_haveTagPrefix )
647  {
648 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
649  m_current->setPrefix( m_tagPrefix );
650  m_haveTagPrefix = false;
651  }
652 
653  if( m_attribs.size() )
654  {
655  m_current->setAttributes( m_attribs );
656 // printf( "added %d attributes, ", m_attribs.size() );
657  m_attribs.clear();
658  }
659 
660  if( m_xmlnss )
661  {
662 // printf( "have ns decls\n" );
663 // StringMap::const_iterator it = m_xmlnss->begin();
664 // for( ; it != m_xmlnss->end(); ++it )
665 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
666  m_current->setXmlns( m_xmlnss );
667  m_xmlnss = 0;
668  }
669 
670  m_current->setXmlns( m_xmlns );
671  m_xmlns = EmptyString;
672 
673  if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
674  {
675  streamEvent( m_root );
676  cleanup( m_deleteRoot );
677  return;
678  }
679 // else
680 // printf( "%s, ", m_root->xml().c_str() );
681 
682  if( m_root && m_root == m_current && m_tagPrefix == "stream" )
683  m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
684 
685  if( m_tag == "xml" && m_preamble == 2 )
686  cleanup();
687  }
688 
689  void Parser::addAttribute()
690  {
691  Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
692  if( m_attribIsXmlns )
693  {
694  if( !m_xmlnss )
695  m_xmlnss = new StringMap();
696 
697  (*m_xmlnss)[m_attrib] = m_value;
698  attr->setPrefix( XMLNS );
699  }
700  else
701  {
702 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
703  if( !m_attribPrefix.empty() )
704  attr->setPrefix( m_attribPrefix );
705  if( m_attrib == XMLNS )
706  m_xmlns = m_value;
707  }
708  m_attribs.push_back( attr );
709  m_attrib = EmptyString;
710  m_value = EmptyString;
711  m_attribPrefix = EmptyString;
712  m_haveAttribPrefix = false;
713  m_attribIsXmlns = false;
714  }
715 
716  void Parser::addCData()
717  {
718  if( m_current && !m_cdata.empty() )
719  {
720  m_current->addCData( m_cdata );
721 // printf( "added cdata %s to %s: %s\n",
722 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
723  m_cdata = EmptyString;
724  }
725  }
726 
727  bool Parser::closeTag()
728  {
729 // printf( "about to close, " );
730 
731  if( m_tag == "stream" && m_tagPrefix == "stream" )
732  return true;
733 
734  if( !m_current || m_current->name() != m_tag
735  || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
736  {
737 // printf( "current xml: %s\n", m_current->xml().c_str() );
738 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
739 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
740  return false;
741  }
742 
743 // printf( "m_current: %s, ", m_current->name().c_str() );
744 // printf( "m_tag: %s, ", m_tag.c_str() );
745 
746  m_tagPrefix = EmptyString;
747  m_haveTagPrefix = false;
748 
749  if( m_current->parent() )
750  m_current = m_current->parent();
751  else
752  {
753 // printf( "pushing upstream\n" );
754  streamEvent( m_root );
755  cleanup( m_deleteRoot );
756  }
757 
758  return true;
759  }
760 
761  void Parser::cleanup( bool deleteRoot )
762  {
763  if( deleteRoot )
764  delete m_root;
765  m_root = 0;
766  m_current = 0;
767  delete m_xmlnss;
768  m_xmlnss = 0;
769  m_cdata = EmptyString;
770  m_tag = EmptyString;
771  m_attrib = EmptyString;
772  m_attribPrefix = EmptyString;
773  m_tagPrefix = EmptyString;
774  m_haveAttribPrefix = false;
775  m_haveTagPrefix = false;
776  m_value = EmptyString;
777  m_xmlns = EmptyString;
778  util::clearList( m_attribs );
779  m_attribs.clear();
780  m_state = Initial;
781  m_preamble = 0;
782  }
783 
784  bool Parser::isValid( unsigned char c )
785  {
786  return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
787  }
788 
789  bool Parser::isWhitespace( unsigned char c )
790  {
791  return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
792  }
793 
794  void Parser::streamEvent( Tag* tag )
795  {
796  if( m_tagHandler )
797  m_tagHandler->handleTag( tag );
798  }
799 
800 }