gloox  1.0.1
parser.cpp
1 /*
2  Copyright (c) 2004-2012 by Jakob Schroeter <js@camaya.net>
3  This file is part of the gloox library. http://camaya.net/gloox
4 
5  This software is distributed under a license. The full license
6  agreement can be found in the file LICENSE in this distribution.
7  This software may not be copied, modified, sold or distributed
8  other than expressed in the named license agreement.
9 
10  This software is distributed without any warranty.
11 */
12 
13 #include "gloox.h"
14 #include "util.h"
15 #include "parser.h"
16 
17 #include <cstdlib>
18 
19 namespace gloox
20 {
21 
22  Parser::Parser( TagHandler* ph, bool deleteRoot )
23  : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24  m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
25  m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
26  {
27  }
28 
30  {
31  delete m_root;
32  delete m_xmlnss;
33  }
34 
35  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
36  {
37  std::string::size_type p = data.find( ';', pos );
38  std::string::size_type diff = p - pos;
39 
40  if( p == std::string::npos )
41  {
42  m_backBuffer = data.substr( pos );
43  return DecodeInsufficient;
44  }
45 
46  if( diff < 3 || diff > 9 )
47  return DecodeInvalid;
48 
49  std::string rep;
50  switch( data[pos + 1] )
51  {
52  case '#':
53  {
54  int base = 10;
55  int idx = 2;
56 
57  if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
58  {
59  base = 16;
60  idx = 3;
61  }
62 
63  char* end;
64  const long int val = std::strtol( data.data() + pos + idx, &end, base );
65  if( *end != ';' || val < 0 )
66  return DecodeInvalid;
67 
68  if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
69  {
70  rep += char( val );
71  }
72  else if( val >= 0x80 && val <= 0x7FF )
73  {
74  rep += char( 192 + ( val >> 6 ) );
75  rep += char( 128 + ( val % 64 ) );
76  }
77  else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
78  {
79  rep += char( 224 + ( val >> 12 ) );
80  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
81  rep += char( 128 + ( val % 64 ) );
82  }
83  else if( val >= 0x100000 && val < 0x10FFFF )
84  {
85  rep += char( 240 + ( val >> 18 ) );
86  rep += char( 128 + ( ( val >> 12 ) % 64 ) );
87  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
88  rep += char( 128 + ( val % 64 ) );
89  }
90  else
91  return DecodeInvalid;
92  }
93  break;
94  case 'l':
95  if( diff == 3 && data[pos + 2] == 't' )
96  rep += '<';
97  else
98  return DecodeInvalid;
99  break;
100  case 'g':
101  if( diff == 3 && data[pos + 2] == 't' )
102  rep += '>';
103  else
104  return DecodeInvalid;
105  break;
106  case 'a':
107  if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
108  rep += '\'';
109  else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
110  rep += '&';
111  else
112  return DecodeInvalid;
113  break;
114  case 'q':
115  if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
116  rep += '"';
117  else
118  return DecodeInvalid;
119  break;
120  default:
121  return DecodeInvalid;
122  }
123 
124  switch( m_state )
125  {
126  case InterTag:
127  case TagInside:
128  m_cdata += rep;
129  break;
130  case TagAttributeValue:
131  m_value += rep;
132  break;
133  default:
134  break;
135  }
136  pos += diff;
137  return DecodeValid;
138  }
139 
140  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
141  const std::string& needle )
142  {
143  if( pos + needle.length() <= data.length() )
144  {
145  if( !data.compare( pos, needle.length(), needle ) )
146  {
147  pos += needle.length() - 1;
148  return ForwardFound;
149  }
150  else
151  {
152  return ForwardNotFound;
153  }
154  }
155  else
156  {
157  m_backBuffer = data.substr( pos );
158  return ForwardInsufficientSize;
159  }
160  }
161 
162  int Parser::feed( std::string& data )
163  {
164  if( !m_backBuffer.empty() )
165  {
166  data.insert( 0, m_backBuffer );
167  m_backBuffer = EmptyString;
168  }
169 
170  std::string::size_type count = data.length();
171  for( std::string::size_type i = 0; i < count; ++i )
172  {
173  const unsigned char c = data[i];
174 // printf( "found char: %c, ", c );
175 
176  if( !isValid( c ) )
177  {
178  cleanup();
179  return static_cast<int>( i );
180  }
181 
182  switch( m_state )
183  {
184  case Initial:
185 // printf( "Initial: %c\n", c );
186  if( isWhitespace( c ) )
187  break;
188 
189  switch( c )
190  {
191  case '<':
192  m_state = TagOpening;
193  break;
194  default:
195  cleanup();
196  return static_cast<int>( i );
197  break;
198  }
199  break;
200  case InterTag:
201 // printf( "InterTag: %c\n", c );
202  m_tag = EmptyString;
203  if( isWhitespace( c ) )
204  {
205  m_state = TagInside;
206  if( m_current )
207  m_cdata += c;
208  break;
209  }
210 
211  switch( c )
212  {
213  case '&':
214 // printf( "InterTag, calling decode\n" );
215  switch( decode( i, data ) )
216  {
217  case DecodeValid:
218  m_state = TagInside;
219  break;
220  case DecodeInvalid:
221  cleanup();
222  return static_cast<int>( i );
223  case DecodeInsufficient:
224  return -1;
225  }
226  break;
227  case '<':
228  m_state = TagOpening;
229  break;
230  case '>':
231  default:
232  if( m_current )
233  {
234  m_cdata += c;
235  m_state = TagInside;
236  }
237  break;
238  }
239  break;
240  case TagOpening: // opening '<' has been found before
241 // printf( "TagOpening: %c\n", c );
242  if( isWhitespace( c ) )
243  break;
244 
245  switch( c )
246  {
247  case '<':
248  case '>':
249  case '&':
250  cleanup();
251  return static_cast<int>( i );
252  break;
253  case '/':
254  m_state = TagClosingSlash;
255  break;
256  case '?':
257  m_state = TagNameCollect;
258  m_preamble = 1;
259  break;
260  case '!':
261  switch( forwardScan( i, data, "![CDATA[" ) )
262  {
263  case ForwardFound:
264  m_state = TagCDATASection;
265  break;
266  case ForwardNotFound:
267  cleanup();
268  return static_cast<int>( i );
269  case ForwardInsufficientSize:
270  return -1;
271  }
272  break;
273  default:
274  m_tag += c;
275  m_state = TagNameCollect;
276  break;
277  }
278  break;
279  case TagCDATASection:
280  switch( c )
281  {
282  case ']':
283  switch( forwardScan( i, data, "]]>" ) )
284  {
285  case ForwardFound:
286  m_state = TagInside;
287  break;
288  case ForwardNotFound:
289  m_cdata += c;
290  break;
291  case ForwardInsufficientSize:
292  return -1;
293  }
294  break;
295  default:
296  m_cdata += c;
297  break;
298  }
299  break;
300  case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
301 // printf( "TagNameCollect: %c\n", c );
302  if( isWhitespace( c ) )
303  {
304  m_state = TagNameComplete;
305  break;
306  }
307 
308  switch( c )
309  {
310  case '<':
311  case '?':
312  case '!':
313  case '&':
314  cleanup();
315  return static_cast<int>( i );
316  break;
317  case '/':
318  m_state = TagOpeningSlash;
319  break;
320  case '>':
321  addTag();
322  m_state = TagInside;
323  break;
324  case ':':
325  if( !m_haveTagPrefix )
326  {
327  m_haveTagPrefix = true;
328  m_tagPrefix = m_tag;
329  m_tag = EmptyString;
330  }
331  else
332  {
333  cleanup();
334  return static_cast<int>( i );
335  }
336  break;
337  default:
338  m_tag += c;
339  break;
340  }
341  break;
342  case TagInside: // we're inside a tag, expecting a child tag or cdata
343 // printf( "TagInside: %c\n", c );
344  m_tag = EmptyString;
345  switch( c )
346  {
347  case '<':
348  addCData();
349  m_state = TagOpening;
350  break;
351  case '&':
352 // printf( "TagInside, calling decode\n" );
353  switch( decode( i, data ) )
354  {
355  case DecodeValid:
356  break;
357  case DecodeInvalid:
358  cleanup();
359  return static_cast<int>( i );
360  case DecodeInsufficient:
361  return -1;
362  }
363  break;
364  default:
365  m_cdata += c;
366  break;
367  }
368  break;
369  case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
370 // printf( "TagOpeningSlash: %c\n", c );
371  if( isWhitespace( c ) )
372  break;
373 
374  if( c == '>' )
375  {
376  addTag();
377  if( !closeTag() )
378  {
379 // printf( "noipe, here\n" );
380  cleanup();
381  return static_cast<int>( i );
382  }
383 
384  m_state = InterTag;
385  }
386  else
387  {
388  cleanup();
389  return static_cast<int>( i );
390  }
391  break;
392  case TagClosingSlash: // we have found the '/' of a closing tag
393 // printf( "TagClosingSlash: %c\n", c );
394  if( isWhitespace( c ) )
395  break;
396 
397  switch( c )
398  {
399  case '>':
400  case '<':
401  case '/':
402  cleanup();
403  return static_cast<int>( i );
404  break;
405  default:
406  m_tag += c;
407  m_state = TagClosing;
408  break;
409  }
410  break;
411  case TagClosing: // we're collecting the name of a closing tag
412 // printf( "TagClosing: %c\n", c );
413  switch( c )
414  {
415  case '<':
416  case '/':
417  case '!':
418  case '?':
419  case '&':
420  cleanup();
421  return static_cast<int>( i );
422  break;
423  case ':':
424  if( !m_haveTagPrefix )
425  {
426  m_haveTagPrefix = true;
427  m_tagPrefix = m_tag;
428  m_tag = EmptyString;
429  }
430  else
431  {
432  cleanup();
433  return static_cast<int>( i );
434  }
435  break;
436  case '>':
437  if( !closeTag() )
438  {
439 // printf( "here\n" );
440  cleanup();
441  return static_cast<int>( i );
442  }
443  m_state = InterTag;
444  break;
445  default:
446  m_tag += c;
447  break;
448  }
449  break;
450  case TagNameComplete: // a tag name is complete, expect tag close or attribs
451 // printf( "TagNameComplete: %c\n", c );
452  if( isWhitespace( c ) )
453  break;
454 
455  switch( c )
456  {
457  case '<':
458  case '!':
459  case '&':
460  cleanup();
461  return static_cast<int>( i );
462  break;
463  case '/':
464  m_state = TagOpeningSlash;
465  break;
466  case '>':
467  if( m_preamble == 1 )
468  {
469  cleanup();
470  return static_cast<int>( i );
471  }
472  m_state = TagInside;
473  addTag();
474  break;
475  case '?':
476  if( m_preamble == 1 )
477  m_preamble = 2;
478  else
479  {
480  cleanup();
481  return static_cast<int>( i );
482  }
483  break;
484  default:
485  m_attrib += c;
486  m_state = TagAttribute;
487  break;
488  }
489  break;
490  case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
491 // printf( "TagAttribute: %c\n", c );
492  if( isWhitespace( c ) )
493  {
494  m_state = TagAttributeComplete;
495  break;
496  }
497 
498  switch( c )
499  {
500  case '<':
501  case '/':
502  case '>':
503  case '?':
504  case '!':
505  case '&':
506  cleanup();
507  return static_cast<int>( i );
508  break;
509  case '=':
510  m_state = TagAttributeEqual;
511  break;
512  case ':':
513  if( !m_haveAttribPrefix && m_attrib != XMLNS )
514  {
515  m_haveAttribPrefix = true;
516  m_attribPrefix = m_attrib;
517  m_attrib = EmptyString;
518  }
519  else if( m_attrib == XMLNS )
520  {
521  m_attribIsXmlns = true;
522  m_attrib = EmptyString;
523  }
524  else
525  {
526  cleanup();
527  return static_cast<int>( i );
528  }
529  break;
530  default:
531  m_attrib += c;
532  }
533  break;
534  case TagAttributeComplete: // we're expecting an equals sign or ws
535 // printf( "TagAttributeComplete: %c\n", c );
536  if( isWhitespace( c ) )
537  break;
538 
539  switch( c )
540  {
541  case '=':
542  m_state = TagAttributeEqual;
543  break;
544  default:
545  cleanup();
546  return static_cast<int>( i );
547  break;
548  }
549  break;
550  case TagAttributeEqual: // we have found an equals sign
551 // printf( "TagAttributeEqual: %c\n", c );
552  if( isWhitespace( c ) )
553  break;
554 
555  switch( c )
556  {
557  case '"':
558  m_quote = true;
559  case '\'':
560  m_state = TagAttributeValue;
561  break;
562  default:
563  cleanup();
564  return static_cast<int>( i );
565  break;
566  }
567  break;
568  case TagAttributeValue: // we're expecting value data
569 // printf( "TagValue: %c\n", c );
570  switch( c )
571  {
572  case '<':
573  cleanup();
574  return static_cast<int>( i );
575  break;
576  case '\'':
577  if( m_quote )
578  {
579  m_value += c;
580  break;
581  }
582  case '"':
583  addAttribute();
584  m_state = TagNameAlmostComplete;
585  m_quote = false;
586  break;
587  case '&':
588 // printf( "TagAttributeValue, calling decode\n" );
589  switch( decode( i, data ) )
590  {
591  case DecodeValid:
592  break;
593  case DecodeInvalid:
594  cleanup();
595  return static_cast<int>( i );
596  case DecodeInsufficient:
597  return -1;
598  }
599  break;
600  case '>':
601  default:
602  m_value += c;
603  }
604  break;
605  case TagNameAlmostComplete:
606 // printf( "TagAttributeEqual: %c\n", c );
607  if( isWhitespace( c ) )
608  {
609  m_state = TagNameComplete;
610  break;
611  }
612 
613  switch( c )
614  {
615  case '/':
616  m_state = TagOpeningSlash;
617  break;
618  case '>':
619  if( m_preamble == 1 )
620  {
621  cleanup();
622  return static_cast<int>( i );
623  }
624  m_state = TagInside;
625  addTag();
626  break;
627  case '?':
628  if( m_preamble == 1 )
629  m_preamble = 2;
630  else
631  {
632  cleanup();
633  return static_cast<int>( i );
634  }
635  break;
636  default:
637  cleanup();
638  return static_cast<int>( i );
639  break;
640  }
641  break;
642  default:
643 // printf( "default action!?\n" );
644  break;
645  }
646 // printf( "parser state: %d\n", m_state );
647  }
648 
649  return -1;
650  }
651 
652  void Parser::addTag()
653  {
654  if( !m_root )
655  {
656 // printf( "created Tag named %s, ", m_tag.c_str() );
657  m_root = new Tag( m_tag );
658  m_current = m_root;
659  }
660  else
661  {
662 // printf( "created Tag named %s, ", m_tag.c_str() );
663  m_current = new Tag( m_current, m_tag );
664  }
665 
666  if( m_haveTagPrefix )
667  {
668 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
669  m_current->setPrefix( m_tagPrefix );
670  m_haveTagPrefix = false;
671  }
672 
673  if( m_attribs.size() )
674  {
675  m_current->setAttributes( m_attribs );
676 // printf( "added %d attributes, ", m_attribs.size() );
677  m_attribs.clear();
678  }
679 
680  if( m_xmlnss )
681  {
682 // printf( "have ns decls\n" );
683 // StringMap::const_iterator it = m_xmlnss->begin();
684 // for( ; it != m_xmlnss->end(); ++it )
685 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
686  m_current->setXmlns( m_xmlnss );
687  m_xmlnss = 0;
688  }
689 
690  m_current->setXmlns( m_xmlns );
691  m_xmlns = EmptyString;
692 
693  if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
694  {
695  streamEvent( m_root );
696  cleanup( m_deleteRoot );
697  return;
698  }
699 // else
700 // printf( "%s, ", m_root->xml().c_str() );
701 
702  if( m_root && m_root == m_current && m_tagPrefix == "stream" )
703  m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
704 
705  if( m_tag == "xml" && m_preamble == 2 )
706  cleanup();
707  }
708 
709  void Parser::addAttribute()
710  {
711  Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
712  if( m_attribIsXmlns )
713  {
714  if( !m_xmlnss )
715  m_xmlnss = new StringMap();
716 
717  (*m_xmlnss)[m_attrib] = m_value;
718  attr->setPrefix( XMLNS );
719  }
720  else
721  {
722 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
723  if( !m_attribPrefix.empty() )
724  attr->setPrefix( m_attribPrefix );
725  if( m_attrib == XMLNS )
726  m_xmlns = m_value;
727  }
728  m_attribs.push_back( attr );
729  m_attrib = EmptyString;
730  m_value = EmptyString;
731  m_attribPrefix = EmptyString;
732  m_haveAttribPrefix = false;
733  m_attribIsXmlns = false;
734  }
735 
736  void Parser::addCData()
737  {
738  if( m_current && !m_cdata.empty() )
739  {
740  m_current->addCData( m_cdata );
741 // printf( "added cdata %s to %s: %s\n",
742 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
743  m_cdata = EmptyString;
744  }
745  }
746 
747  bool Parser::closeTag()
748  {
749 // printf( "about to close, " );
750 
751  if( m_tag == "stream" && m_tagPrefix == "stream" )
752  return true;
753 
754  if( !m_current || m_current->name() != m_tag
755  || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
756  {
757 // printf( "current xml: %s\n", m_current->xml().c_str() );
758 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
759 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
760  return false;
761  }
762 
763 // printf( "m_current: %s, ", m_current->name().c_str() );
764 // printf( "m_tag: %s, ", m_tag.c_str() );
765 
766  m_tagPrefix = EmptyString;
767  m_haveTagPrefix = false;
768 
769  if( m_current->parent() )
770  m_current = m_current->parent();
771  else
772  {
773 // printf( "pushing upstream\n" );
774  streamEvent( m_root );
775  cleanup( m_deleteRoot );
776  }
777 
778  return true;
779  }
780 
781  void Parser::cleanup( bool deleteRoot )
782  {
783  if( deleteRoot )
784  delete m_root;
785  m_root = 0;
786  m_current = 0;
787  delete m_xmlnss;
788  m_xmlnss = 0;
789  m_cdata = EmptyString;
790  m_tag = EmptyString;
791  m_attrib = EmptyString;
792  m_attribPrefix = EmptyString;
793  m_tagPrefix = EmptyString;
794  m_haveAttribPrefix = false;
795  m_haveTagPrefix = false;
796  m_value = EmptyString;
797  m_xmlns = EmptyString;
798  util::clearList( m_attribs );
799  m_attribs.clear();
800  m_state = Initial;
801  m_preamble = 0;
802  }
803 
804  bool Parser::isValid( unsigned char c )
805  {
806  return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
807  }
808 
809  bool Parser::isWhitespace( unsigned char c )
810  {
811  return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
812  }
813 
814  void Parser::streamEvent( Tag* tag )
815  {
816  if( m_tagHandler )
817  m_tagHandler->handleTag( tag );
818  }
819 
820 }