gloox  1.1-svn
parser.cpp
1 /*
2  Copyright (c) 2004-2009 by Jakob Schroeter <js@camaya.net>
3  This file is part of the gloox library. http://camaya.net/gloox
4 
5  This software is distributed under a license. The full license
6  agreement can be found in the file LICENSE in this distribution.
7  This software may not be copied, modified, sold or distributed
8  other than expressed in the named license agreement.
9 
10  This software is distributed without any warranty.
11 */
12 
13 #include "gloox.h"
14 #include "util.h"
15 #include "parser.h"
16 
17 #include <cstdlib>
18 
19 namespace gloox
20 {
21 
22  Parser::Parser( TagHandler* ph, bool deleteRoot )
23  : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24  m_preamble( 0 ), m_return( ParseIncomplete ), m_quote( false ), m_haveTagPrefix( false ),
25  m_haveAttribPrefix( false ), m_attribIsXmlns( false ), m_deleteRoot( deleteRoot ),
26  m_nullRoot( true )
27  {
28  }
29 
31  {
32  delete m_root;
33  delete m_xmlnss;
34  }
35 
36  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
37  {
38  std::string::size_type p = data.find( ';', pos );
39  std::string::size_type diff = p - pos;
40 
41  if( p == std::string::npos )
42  {
43  m_backBuffer = data.substr( pos );
44  return DecodeInsufficient;
45  }
46 
47  if( diff < 3 || diff > 9 )
48  return DecodeInvalid;
49 
50  std::string rep;
51  switch( data[pos + 1] )
52  {
53  case '#':
54  {
55  int base = 10;
56  int idx = 2;
57 
58  if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
59  {
60  base = 16;
61  idx = 3;
62  }
63 
64  char* end;
65  const long int val = std::strtol( data.data() + pos + idx, &end, base );
66  if( *end != ';' || val < 0 )
67  return DecodeInvalid;
68 
69  if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
70  {
71  rep += char( val );
72  }
73  else if( val >= 0x80 && val <= 0x7FF )
74  {
75  rep += char( 192 + ( val >> 6 ) );
76  rep += char( 128 + ( val % 64 ) );
77  }
78  else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
79  {
80  rep += char( 224 + ( val >> 12 ) );
81  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
82  rep += char( 128 + ( val % 64 ) );
83  }
84  else if( val >= 0x100000 && val < 0x10FFFF )
85  {
86  rep += char( 240 + ( val >> 18 ) );
87  rep += char( 128 + ( ( val >> 12 ) % 64 ) );
88  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
89  rep += char( 128 + ( val % 64 ) );
90  }
91  else
92  return DecodeInvalid;
93  }
94  break;
95  case 'l':
96  if( diff == 3 && data[pos + 2] == 't' )
97  rep += '<';
98  else
99  return DecodeInvalid;
100  break;
101  case 'g':
102  if( diff == 3 && data[pos + 2] == 't' )
103  rep += '>';
104  else
105  return DecodeInvalid;
106  break;
107  case 'a':
108  if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
109  rep += '\'';
110  else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
111  rep += '&';
112  else
113  return DecodeInvalid;
114  break;
115  case 'q':
116  if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
117  rep += '"';
118  else
119  return DecodeInvalid;
120  break;
121  default:
122  return DecodeInvalid;
123  }
124 
125  switch( m_state )
126  {
127  case TagInside:
128  m_cdata += rep;
129  break;
130  case TagAttributeValue:
131  m_value += rep;
132  break;
133  default:
134  break;
135  }
136  pos += diff;
137  return DecodeValid;
138  }
139 
140  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
141  const std::string& needle )
142  {
143  if( pos + needle.length() <= data.length() )
144  {
145  if( !data.compare( pos, needle.length(), needle ) )
146  {
147  pos += needle.length() - 1;
148  return ForwardFound;
149  }
150  else
151  {
152  return ForwardNotFound;
153  }
154  }
155  else
156  {
157  m_backBuffer = data.substr( pos );
158  return ForwardInsufficientSize;
159  }
160  }
161 
162  int Parser::feed( std::string& data )
163  {
164  if( !m_backBuffer.empty() )
165  {
166  data.insert( 0, m_backBuffer );
167  m_backBuffer = EmptyString;
168  }
169 
170  std::string::size_type count = data.length();
171  for( std::string::size_type i = 0; i < count; ++i )
172  {
173  const unsigned char c = data[i];
174 // printf( "found char: %c, ", c );
175 
176  if( !isValid( c ) )
177  {
178  cleanup();
179  return static_cast<int>( i );
180  }
181 
182  switch( m_state )
183  {
184  case Initial:
185 // printf( "Initial: %c\n", c );
186  if( isWhitespace( c ) )
187  break;
188 
189  switch( c )
190  {
191  case '<':
192  m_state = TagOpening;
193  break;
194  default:
195  cleanup();
196  return static_cast<int>( i );
197  break;
198  }
199  break;
200  case InterTag:
201 // printf( "InterTag: %c\n", c );
202  m_tag = EmptyString;
203  if( isWhitespace( c ) )
204  break;
205 
206  switch( c )
207  {
208  case '<':
209  m_state = TagOpening;
210  break;
211  case '>':
212  default:
213  if( m_current )
214  {
215  m_cdata += c;
216  m_state = TagInside;
217  }
218  break;
219  }
220  break;
221  case TagOpening: // opening '<' has been found before
222 // printf( "TagOpening: %c\n", c );
223  if( isWhitespace( c ) )
224  break;
225 
226  switch( c )
227  {
228  case '<':
229  case '>':
230  case '&':
231  cleanup();
232  return static_cast<int>( i );
233  break;
234  case '/':
235  m_state = TagClosingSlash;
236  break;
237  case '?':
238  m_state = TagNameCollect;
239  m_preamble = 1;
240  break;
241  case '!':
242  if( i + 1 >= data.length() )
243  return -1;
244 
245  switch( data[i + 1] )
246  {
247  case '[':
248  switch( forwardScan( i, data, "![CDATA[" ) )
249  {
250  case ForwardFound:
251  m_state = TagCDATASection;
252  break;
253  case ForwardNotFound:
254  cleanup();
255  return i;
256  break;
257  case ForwardInsufficientSize:
258  return -1;
259  }
260  break;
261  case '-':
262  switch( forwardScan( i, data, "!-- " ) )
263  {
264  case ForwardFound:
265  m_state = XMLComment;
266  break;
267  case ForwardNotFound:
268  cleanup();
269  return i;
270  break;
271  case ForwardInsufficientSize:
272  return -1;
273  }
274  break;
275  default:
276  cleanup();
277  return static_cast<int>( i );
278  break;
279  }
280  break;
281  default:
282  m_tag += c;
283  m_state = TagNameCollect;
284  break;
285  }
286  break;
287  case TagCDATASection:
288  switch( c )
289  {
290  case ']':
291  switch( forwardScan( i, data, "]]>" ) )
292  {
293  case ForwardFound:
294  m_state = TagInside;
295  break;
296  case ForwardNotFound:
297  m_cdata += c;
298  break;
299  case ForwardInsufficientSize:
300  return -1;
301  }
302  break;
303  default:
304  m_cdata += c;
305  break;
306  }
307  break;
308  case XMLComment: // we're inside an XMLcomment.
309  if( c == ' ' )
310  {
311  switch( forwardScan( i, data, " -->" ) )
312  {
313  case ForwardFound:
314  m_state = InterTag;
315  break;
316  case ForwardNotFound:
317  break;
318  case ForwardInsufficientSize:
319  return -1;
320  }
321  }
322  break;
323  case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
324 // printf( "TagNameCollect: %c\n", c );
325  if( isWhitespace( c ) )
326  {
327  m_state = TagNameComplete;
328  break;
329  }
330 
331  switch( c )
332  {
333  case '<':
334  case '?':
335  case '!':
336  case '&':
337  cleanup();
338  return static_cast<int>( i );
339  break;
340  case '/':
341  m_state = TagOpeningSlash;
342  break;
343  case '>':
344  addTag();
345  m_state = TagInside;
346  break;
347  case ':':
348  if( !m_haveTagPrefix )
349  {
350  m_haveTagPrefix = true;
351  m_tagPrefix = m_tag;
352  m_tag = EmptyString;
353  }
354  else
355  {
356  cleanup();
357  return static_cast<int>( i );
358  }
359  break;
360  default:
361  m_tag += c;
362  break;
363  }
364  break;
365  case TagInside: // we're inside a tag, expecting a child tag or cdata
366 // printf( "TagInside: %c\n", c );
367  m_tag = EmptyString;
368  switch( c )
369  {
370  case '<':
371  addCData();
372  m_state = TagOpening;
373  break;
374  case '&':
375 // printf( "TagInside, calling decode\n" );
376  switch( decode( i, data ) )
377  {
378  case DecodeValid:
379  break;
380  case DecodeInvalid:
381  cleanup();
382  return static_cast<int>( i );
383  case DecodeInsufficient:
384  return -1;
385  }
386  break;
387  default:
388  m_cdata += c;
389  break;
390  }
391  break;
392  case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
393 // printf( "TagOpeningSlash: %c\n", c );
394  if( isWhitespace( c ) )
395  break;
396 
397  if( c == '>' )
398  {
399  addTag();
400  if( !closeTag() )
401  {
402 // printf( "noipe, here\n" );
403  cleanup();
404  return static_cast<int>( i );
405  }
406 
407  m_state = InterTag;
408  }
409  else
410  {
411  cleanup();
412  return static_cast<int>( i );
413  }
414  break;
415  case TagClosingSlash: // we have found the '/' of a closing tag
416 // printf( "TagClosingSlash: %c\n", c );
417  if( isWhitespace( c ) )
418  break;
419 
420  switch( c )
421  {
422  case '>':
423  case '<':
424  case '/':
425  cleanup();
426  return static_cast<int>( i );
427  break;
428  default:
429  m_tag += c;
430  m_state = TagClosing;
431  break;
432  }
433  break;
434  case TagClosing: // we're collecting the name of a closing tag
435 // printf( "TagClosing: %c\n", c );
436  switch( c )
437  {
438  case '<':
439  case '/':
440  case '!':
441  case '?':
442  case '&':
443  cleanup();
444  return static_cast<int>( i );
445  break;
446  case ':':
447  if( !m_haveTagPrefix )
448  {
449  m_haveTagPrefix = true;
450  m_tagPrefix = m_tag;
451  m_tag = EmptyString;
452  }
453  else
454  {
455  cleanup();
456  return static_cast<int>( i );
457  }
458  break;
459  case '>':
460  if( !closeTag() )
461  {
462 // printf( "here\n" );
463  cleanup();
464  return static_cast<int>( i );
465  }
466  m_state = InterTag;
467  break;
468  default:
469  m_tag += c;
470  break;
471  }
472  break;
473  case TagNameComplete: // a tag name is complete, expect tag close or attribs
474 // printf( "TagNameComplete: %c\n", c );
475  if( isWhitespace( c ) )
476  break;
477 
478  switch( c )
479  {
480  case '<':
481  case '!':
482  case '&':
483  cleanup();
484  return static_cast<int>( i );
485  break;
486  case '/':
487  m_state = TagOpeningSlash;
488  break;
489  case '>':
490  if( m_preamble == 1 )
491  {
492  cleanup();
493  return static_cast<int>( i );
494  }
495  m_state = TagInside;
496  addTag();
497  break;
498  case '?':
499  if( m_preamble == 1 )
500  m_preamble = 2;
501  else
502  {
503  cleanup();
504  return static_cast<int>( i );
505  }
506  break;
507  default:
508  m_attrib += c;
509  m_state = TagAttribute;
510  break;
511  }
512  break;
513  case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
514 // printf( "TagAttribute: %c\n", c );
515  if( isWhitespace( c ) )
516  {
517  m_state = TagAttributeComplete;
518  break;
519  }
520 
521  switch( c )
522  {
523  case '<':
524  case '/':
525  case '>':
526  case '?':
527  case '!':
528  case '&':
529  cleanup();
530  return static_cast<int>( i );
531  break;
532  case '=':
533  m_state = TagAttributeEqual;
534  break;
535  case ':':
536  if( !m_haveAttribPrefix && m_attrib != XMLNS )
537  {
538  m_haveAttribPrefix = true;
539  m_attribPrefix = m_attrib;
540  m_attrib = EmptyString;
541  }
542  else if( m_attrib == XMLNS )
543  {
544  m_attribIsXmlns = true;
545  m_attrib = EmptyString;
546  }
547  else
548  {
549  cleanup();
550  return static_cast<int>( i );
551  }
552  break;
553  default:
554  m_attrib += c;
555  }
556  break;
557  case TagAttributeComplete: // we're expecting an equals sign or ws
558 // printf( "TagAttributeComplete: %c\n", c );
559  if( isWhitespace( c ) )
560  break;
561 
562  switch( c )
563  {
564  case '=':
565  m_state = TagAttributeEqual;
566  break;
567  default:
568  cleanup();
569  return static_cast<int>( i );
570  break;
571  }
572  break;
573  case TagAttributeEqual: // we have found an equals sign
574 // printf( "TagAttributeEqual: %c\n", c );
575  if( isWhitespace( c ) )
576  break;
577 
578  switch( c )
579  {
580  case '"':
581  m_quote = true;
582  case '\'':
583  m_state = TagAttributeValue;
584  break;
585  default:
586  cleanup();
587  return static_cast<int>( i );
588  break;
589  }
590  break;
591  case TagAttributeValue: // we're expecting value data
592 // printf( "TagValue: %c\n", c );
593  switch( c )
594  {
595  case '<':
596  cleanup();
597  return static_cast<int>( i );
598  break;
599  case '\'':
600  if( m_quote )
601  {
602  m_value += c;
603  break;
604  }
605  case '"':
606  addAttribute();
607  m_state = TagNameAlmostComplete;
608  m_quote = false;
609  break;
610  case '&':
611 // printf( "TagAttributeValue, calling decode\n" );
612  switch( decode( i, data ) )
613  {
614  case DecodeValid:
615  break;
616  case DecodeInvalid:
617  cleanup();
618  return static_cast<int>( i );
619  case DecodeInsufficient:
620  return -1;
621  }
622  break;
623  case '>':
624  default:
625  m_value += c;
626  }
627  break;
628  case TagNameAlmostComplete:
629 // printf( "TagAttributeEqual: %c\n", c );
630  if( isWhitespace( c ) )
631  {
632  m_state = TagNameComplete;
633  break;
634  }
635 
636  switch( c )
637  {
638  case '/':
639  m_state = TagOpeningSlash;
640  break;
641  case '>':
642  if( m_preamble == 1 )
643  {
644  cleanup();
645  return static_cast<int>( i );
646  }
647  m_state = TagInside;
648  addTag();
649  break;
650  case '?':
651  if( m_preamble == 1 )
652  m_preamble = 2;
653  else
654  {
655  cleanup();
656  return static_cast<int>( i );
657  }
658  break;
659  default:
660  cleanup();
661  return static_cast<int>( i );
662  break;
663  }
664  break;
665  default:
666 // printf( "default action!?\n" );
667  break;
668  }
669 // printf( "parser state: %d\n", m_state );
670  }
671 
672  return -1;
673  }
674 
675  void Parser::addTag()
676  {
677  if( !m_root )
678  {
679 // printf( "created Tag named %s, ", m_tag.c_str() );
680  m_root = new Tag( m_tag );
681  m_current = m_root;
682  }
683  else
684  {
685 // printf( "created Tag named %s, ", m_tag.c_str() );
686  m_current = new Tag( m_current, m_tag );
687  }
688 
689  if( m_haveTagPrefix )
690  {
691 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
692  m_current->setPrefix( m_tagPrefix );
693  m_haveTagPrefix = false;
694  }
695 
696  if( m_attribs.size() )
697  {
698  m_current->setAttributes( m_attribs );
699 // printf( "added %d attributes, ", m_attribs.size() );
700  m_attribs.clear();
701  }
702 
703  if( m_xmlnss )
704  {
705 // printf( "have ns decls\n" );
706 // StringMap::const_iterator it = m_xmlnss->begin();
707 // for( ; it != m_xmlnss->end(); ++it )
708 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
709  m_current->setXmlns( m_xmlnss );
710  m_xmlnss = 0;
711  }
712 
713  m_current->setXmlns( m_xmlns );
714  m_xmlns = EmptyString;
715 
716  if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
717  {
718  streamEvent( m_root );
719  cleanup( m_deleteRoot );
720  return;
721  }
722 // else
723 // printf( "%s, ", m_root->xml().c_str() );
724 
725  if( m_root && m_root == m_current && m_tagPrefix == "stream" )
726  m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
727 
728  if( m_tag == "xml" && m_preamble == 2 )
729  cleanup();
730  }
731 
732  void Parser::addAttribute()
733  {
734  Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
735  if( m_attribIsXmlns )
736  {
737  if( !m_xmlnss )
738  m_xmlnss = new StringMap();
739 
740  (*m_xmlnss)[m_attrib] = m_value;
741  attr->setPrefix( XMLNS );
742  }
743  else
744  {
745 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
746  if( !m_attribPrefix.empty() )
747  attr->setPrefix( m_attribPrefix );
748  if( m_attrib == XMLNS )
749  m_xmlns = m_value;
750  }
751  m_attribs.push_back( attr );
752  m_attrib = EmptyString;
753  m_value = EmptyString;
754  m_attribPrefix = EmptyString;
755  m_haveAttribPrefix = false;
756  m_attribIsXmlns = false;
757  }
758 
759  void Parser::addCData()
760  {
761  if( m_current && !m_cdata.empty() )
762  {
763  m_current->addCData( m_cdata );
764 // printf( "added cdata %s to %s: %s\n",
765 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
766  m_cdata = EmptyString;
767  }
768  }
769 
770  bool Parser::closeTag()
771  {
772 // printf( "about to close, " );
773 
774  if( m_tag == "stream" && m_tagPrefix == "stream" )
775  return true;
776 
777  if( !m_current || m_current->name() != m_tag
778  || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
779  {
780 // printf( "current xml: %s\n", m_current->xml().c_str() );
781 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
782 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
783  return false;
784  }
785 
786 // printf( "m_current: %s, ", m_current->name().c_str() );
787 // printf( "m_tag: %s, ", m_tag.c_str() );
788 
789  m_tagPrefix = EmptyString;
790  m_haveTagPrefix = false;
791 
792  if( m_current->parent() )
793  m_current = m_current->parent();
794  else
795  {
796 // printf( "pushing upstream\n" );
797  streamEvent( m_root );
798  m_return = ParseOK;
799  cleanup( m_deleteRoot );
800  }
801 
802  return true;
803  }
804 
805  void Parser::cleanup( bool deleteRoot )
806  {
807  if( deleteRoot )
808  delete m_root;
809  if( m_nullRoot )
810  m_root = 0;
811  m_current = 0;
812  delete m_xmlnss;
813  m_xmlnss = 0;
814  m_cdata = EmptyString;
815  m_tag = EmptyString;
816  m_attrib = EmptyString;
817  m_attribPrefix = EmptyString;
818  m_tagPrefix = EmptyString;
819  m_haveAttribPrefix = false;
820  m_haveTagPrefix = false;
821  m_value = EmptyString;
822  m_xmlns = EmptyString;
823  util::clearList( m_attribs );
824  m_attribs.clear();
825  m_state = Initial;
826  m_preamble = 0;
827  }
828 
829  bool Parser::isValid( unsigned char c )
830  {
831  return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
832  }
833 
834  bool Parser::isWhitespace( unsigned char c )
835  {
836  return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
837  }
838 
839  void Parser::streamEvent( Tag* tag )
840  {
841  if( m_tagHandler )
842  m_tagHandler->handleTag( tag );
843  }
844 
845  Tag* Parser::parse( std::string& data )
846  {
847  Parser p( 0, false );
848  p.m_nullRoot = false;
849  int i = p.feed( data );
850  if( i == -1 && p.m_return == ParseOK )
851  return p.m_root->clone();
852  else
853  return 0;
854  }
855 
856 }