gloox  1.0.27
parser.cpp
1 /*
2  Copyright (c) 2004-2023 by Jakob Schröter <js@camaya.net>
3  This file is part of the gloox library. http://camaya.net/gloox
4 
5  This software is distributed under a license. The full license
6  agreement can be found in the file LICENSE in this distribution.
7  This software may not be copied, modified, sold or distributed
8  other than expressed in the named license agreement.
9 
10  This software is distributed without any warranty.
11 */
12 
13 #include "gloox.h"
14 #include "util.h"
15 #include "parser.h"
16 
17 #include <cstdlib>
18 
19 namespace gloox
20 {
21 
22  Parser::Parser( TagHandler* ph, bool deleteRoot )
23  : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24  m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
25  m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
26  {
27  }
28 
30  {
31  cleanup( true );
32  }
33 
34  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
35  {
36  std::string::size_type p = data.find( ';', pos );
37  std::string::size_type diff = p - pos;
38 
39  if( p == std::string::npos )
40  {
41  m_backBuffer = data.substr( pos );
42  return DecodeInsufficient;
43  }
44 
45  if( diff < 3 || diff > 9 )
46  return DecodeInvalid;
47 
48  std::string rep;
49  switch( data[pos + 1] )
50  {
51  case '#':
52  {
53  int base = 10;
54  int idx = 2;
55 
56  if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
57  {
58  base = 16;
59  idx = 3;
60  }
61 
62  char* end;
63  const long int val = std::strtol( data.data() + pos + idx, &end, base );
64  if( *end != ';' || val < 0 )
65  return DecodeInvalid;
66 
67  if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
68  {
69  rep += char( val );
70  }
71  else if( val >= 0x80 && val <= 0x7FF )
72  {
73  rep += char( 192 + ( val >> 6 ) );
74  rep += char( 128 + ( val % 64 ) );
75  }
76  else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
77  {
78  rep += char( 224 + ( val >> 12 ) );
79  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
80  rep += char( 128 + ( val % 64 ) );
81  }
82  else if( val >= 0x100000 && val < 0x10FFFF )
83  {
84  rep += char( 240 + ( val >> 18 ) );
85  rep += char( 128 + ( ( val >> 12 ) % 64 ) );
86  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
87  rep += char( 128 + ( val % 64 ) );
88  }
89  else
90  return DecodeInvalid;
91  }
92  break;
93  case 'l':
94  if( diff == 3 && data[pos + 2] == 't' )
95  rep += '<';
96  else
97  return DecodeInvalid;
98  break;
99  case 'g':
100  if( diff == 3 && data[pos + 2] == 't' )
101  rep += '>';
102  else
103  return DecodeInvalid;
104  break;
105  case 'a':
106  if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
107  rep += '\'';
108  else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
109  rep += '&';
110  else
111  return DecodeInvalid;
112  break;
113  case 'q':
114  if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
115  rep += '"';
116  else
117  return DecodeInvalid;
118  break;
119  default:
120  return DecodeInvalid;
121  }
122 
123  switch( m_state )
124  {
125  case InterTag:
126  case TagInside:
127  m_cdata += rep;
128  break;
129  case TagAttributeValue:
130  m_value += rep;
131  break;
132  default:
133  break;
134  }
135  pos += diff;
136  return DecodeValid;
137  }
138 
139  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
140  const std::string& needle )
141  {
142  if( pos + needle.length() <= data.length() )
143  {
144  if( !data.compare( pos, needle.length(), needle ) )
145  {
146  pos += needle.length() - 1;
147  return ForwardFound;
148  }
149  else
150  {
151  return ForwardNotFound;
152  }
153  }
154  else
155  {
156  m_backBuffer = data.substr( pos );
157  return ForwardInsufficientSize;
158  }
159  }
160 
161  int Parser::feed( std::string& data )
162  {
163  if( !m_backBuffer.empty() )
164  {
165  data.insert( 0, m_backBuffer );
166  m_backBuffer = EmptyString;
167  }
168 
169  std::string::size_type count = data.length();
170  for( std::string::size_type i = 0; i < count; ++i )
171  {
172  const unsigned char c = data[i];
173 // printf( "found char: %c, ", c );
174 
175  switch( m_state )
176  {
177  case Initial:
178 // printf( "Initial: %c\n", c );
179  if( isWhitespace( c ) )
180  break;
181 
182  switch( c )
183  {
184  case '<':
185  m_state = TagOpening;
186  break;
187  default:
188  cleanup();
189  return static_cast<int>( i );
190  break;
191  }
192  break;
193  case InterTag:
194 // printf( "InterTag: %c\n", c );
195  m_tag = EmptyString;
196  if( isWhitespace( c ) )
197  {
198  m_state = TagInside;
199  if( m_current )
200  m_cdata += c;
201  break;
202  }
203 
204  switch( c )
205  {
206  case '&':
207 // printf( "InterTag, calling decode\n" );
208  switch( decode( i, data ) )
209  {
210  case DecodeValid:
211  m_state = TagInside;
212  break;
213  case DecodeInvalid:
214  cleanup();
215  return static_cast<int>( i );
216  case DecodeInsufficient:
217  return -1;
218  }
219  break;
220  case '<':
221  m_state = TagOpening;
222  break;
223  case '>':
224  default:
225  if( m_current )
226  {
227  m_cdata += c;
228  m_state = TagInside;
229  }
230  break;
231  }
232  break;
233  case TagOpening: // opening '<' has been found before
234 // printf( "TagOpening: %c\n", c );
235  if( isWhitespace( c ) )
236  break;
237 
238  switch( c )
239  {
240  case '<':
241  case '>':
242  case '&':
243  cleanup();
244  return static_cast<int>( i );
245  break;
246  case '/':
247  m_state = TagClosingSlash;
248  break;
249  case '?':
250  m_state = TagNameCollect;
251  m_preamble = 1;
252  break;
253  case '!':
254  switch( forwardScan( i, data, "![CDATA[" ) )
255  {
256  case ForwardFound:
257  m_state = TagCDATASection;
258  break;
259  case ForwardNotFound:
260  cleanup();
261  return static_cast<int>( i );
262  case ForwardInsufficientSize:
263  return -1;
264  }
265  break;
266  default:
267  m_tag += c;
268  m_state = TagNameCollect;
269  break;
270  }
271  break;
272  case TagCDATASection:
273  switch( c )
274  {
275  case ']':
276  switch( forwardScan( i, data, "]]>" ) )
277  {
278  case ForwardFound:
279  m_state = TagInside;
280  break;
281  case ForwardNotFound:
282  m_cdata += c;
283  break;
284  case ForwardInsufficientSize:
285  return -1;
286  }
287  break;
288  default:
289  m_cdata += c;
290  break;
291  }
292  break;
293  case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
294 // printf( "TagNameCollect: %c\n", c );
295  if( isWhitespace( c ) )
296  {
297  m_state = TagNameComplete;
298  break;
299  }
300 
301  switch( c )
302  {
303  case '<':
304  case '?':
305  case '!':
306  case '&':
307  cleanup();
308  return static_cast<int>( i );
309  break;
310  case '/':
311  m_state = TagOpeningSlash;
312  break;
313  case '>':
314  addTag();
315  m_state = TagInside;
316  break;
317  case ':':
318  if( !m_haveTagPrefix )
319  {
320  m_haveTagPrefix = true;
321  m_tagPrefix = m_tag;
322  m_tag = EmptyString;
323  }
324  else
325  {
326  cleanup();
327  return static_cast<int>( i );
328  }
329  break;
330  default:
331  m_tag += c;
332  break;
333  }
334  break;
335  case TagInside: // we're inside a tag, expecting a child tag or cdata
336 // printf( "TagInside: %c\n", c );
337  m_tag = EmptyString;
338  switch( c )
339  {
340  case '<':
341  addCData();
342  m_state = TagOpening;
343  break;
344  case '&':
345 // printf( "TagInside, calling decode\n" );
346  switch( decode( i, data ) )
347  {
348  case DecodeValid:
349  break;
350  case DecodeInvalid:
351  cleanup();
352  return static_cast<int>( i );
353  case DecodeInsufficient:
354  return -1;
355  }
356  break;
357  default:
358  m_cdata += c;
359  break;
360  }
361  break;
362  case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
363 // printf( "TagOpeningSlash: %c\n", c );
364  if( isWhitespace( c ) )
365  break;
366 
367  if( c == '>' )
368  {
369  addTag();
370  if( !closeTag() )
371  {
372 // printf( "noipe, here\n" );
373  cleanup();
374  return static_cast<int>( i );
375  }
376 
377  m_state = InterTag;
378  }
379  else
380  {
381  cleanup();
382  return static_cast<int>( i );
383  }
384  break;
385  case TagClosingSlash: // we have found the '/' of a closing tag
386 // printf( "TagClosingSlash: %c\n", c );
387  if( isWhitespace( c ) )
388  break;
389 
390  switch( c )
391  {
392  case '>':
393  case '<':
394  case '/':
395  cleanup();
396  return static_cast<int>( i );
397  break;
398  default:
399  m_tag += c;
400  m_state = TagClosing;
401  break;
402  }
403  break;
404  case TagClosing: // we're collecting the name of a closing tag
405 // printf( "TagClosing: %c\n", c );
406  switch( c )
407  {
408  case '<':
409  case '/':
410  case '!':
411  case '?':
412  case '&':
413  cleanup();
414  return static_cast<int>( i );
415  break;
416  case ':':
417  if( !m_haveTagPrefix )
418  {
419  m_haveTagPrefix = true;
420  m_tagPrefix = m_tag;
421  m_tag = EmptyString;
422  }
423  else
424  {
425  cleanup();
426  return static_cast<int>( i );
427  }
428  break;
429  case '>':
430  if( !closeTag() )
431  {
432 // printf( "here\n" );
433  cleanup();
434  return static_cast<int>( i );
435  }
436  m_state = InterTag;
437  break;
438  default:
439  m_tag += c;
440  break;
441  }
442  break;
443  case TagNameComplete: // a tag name is complete, expect tag close or attribs
444 // printf( "TagNameComplete: %c\n", c );
445  if( isWhitespace( c ) )
446  break;
447 
448  switch( c )
449  {
450  case '<':
451  case '!':
452  case '&':
453  cleanup();
454  return static_cast<int>( i );
455  break;
456  case '/':
457  m_state = TagOpeningSlash;
458  break;
459  case '>':
460  if( m_preamble == 1 )
461  {
462  cleanup();
463  return static_cast<int>( i );
464  }
465  m_state = TagInside;
466  addTag();
467  break;
468  case '?':
469  if( m_preamble == 1 )
470  m_preamble = 2;
471  else
472  {
473  cleanup();
474  return static_cast<int>( i );
475  }
476  break;
477  default:
478  m_attrib += c;
479  m_state = TagAttribute;
480  break;
481  }
482  break;
483  case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
484 // printf( "TagAttribute: %c\n", c );
485  if( isWhitespace( c ) )
486  {
487  m_state = TagAttributeComplete;
488  break;
489  }
490 
491  switch( c )
492  {
493  case '<':
494  case '/':
495  case '>':
496  case '?':
497  case '!':
498  case '&':
499  cleanup();
500  return static_cast<int>( i );
501  break;
502  case '=':
503  m_state = TagAttributeEqual;
504  break;
505  case ':':
506  if( !m_haveAttribPrefix && m_attrib != XMLNS )
507  {
508  m_haveAttribPrefix = true;
509  m_attribPrefix = m_attrib;
510  m_attrib = EmptyString;
511  }
512  else if( m_attrib == XMLNS )
513  {
514  m_attribIsXmlns = true;
515  m_attrib = EmptyString;
516  }
517  else
518  {
519  cleanup();
520  return static_cast<int>( i );
521  }
522  break;
523  default:
524  m_attrib += c;
525  }
526  break;
527  case TagAttributeComplete: // we're expecting an equals sign or ws
528 // printf( "TagAttributeComplete: %c\n", c );
529  if( isWhitespace( c ) )
530  break;
531 
532  switch( c )
533  {
534  case '=':
535  m_state = TagAttributeEqual;
536  break;
537  default:
538  cleanup();
539  return static_cast<int>( i );
540  break;
541  }
542  break;
543  case TagAttributeEqual: // we have found an equals sign
544 // printf( "TagAttributeEqual: %c\n", c );
545  if( isWhitespace( c ) )
546  break;
547 
548  switch( c )
549  {
550  case '"':
551  m_quote = true;
552  case '\'':
553  m_state = TagAttributeValue;
554  break;
555  default:
556  cleanup();
557  return static_cast<int>( i );
558  break;
559  }
560  break;
561  case TagAttributeValue: // we're expecting value data
562 // printf( "TagValue: %c\n", c );
563  switch( c )
564  {
565  case '<':
566  cleanup();
567  return static_cast<int>( i );
568  break;
569  case '\'':
570  if( m_quote )
571  {
572  m_value += c;
573  break;
574  }
575  case '"':
576  addAttribute();
577  m_state = TagNameAlmostComplete;
578  m_quote = false;
579  break;
580  case '&':
581 // printf( "TagAttributeValue, calling decode\n" );
582  switch( decode( i, data ) )
583  {
584  case DecodeValid:
585  break;
586  case DecodeInvalid:
587  cleanup();
588  return static_cast<int>( i );
589  case DecodeInsufficient:
590  return -1;
591  }
592  break;
593  case '>':
594  default:
595  m_value += c;
596  }
597  break;
598  case TagNameAlmostComplete:
599 // printf( "TagAttributeEqual: %c\n", c );
600  if( isWhitespace( c ) )
601  {
602  m_state = TagNameComplete;
603  break;
604  }
605 
606  switch( c )
607  {
608  case '/':
609  m_state = TagOpeningSlash;
610  break;
611  case '>':
612  if( m_preamble == 1 )
613  {
614  cleanup();
615  return static_cast<int>( i );
616  }
617  m_state = TagInside;
618  addTag();
619  break;
620  case '?':
621  if( m_preamble == 1 )
622  m_preamble = 2;
623  else
624  {
625  cleanup();
626  return static_cast<int>( i );
627  }
628  break;
629  default:
630  cleanup();
631  return static_cast<int>( i );
632  break;
633  }
634  break;
635  default:
636 // printf( "default action!?\n" );
637  break;
638  }
639 // printf( "parser state: %d\n", m_state );
640  }
641 
642  return -1;
643  }
644 
645  void Parser::addTag()
646  {
647  if( !m_root )
648  {
649 // printf( "created Tag named %s, ", m_tag.c_str() );
650  m_root = new Tag( m_tag );
651  m_current = m_root;
652  }
653  else
654  {
655 // printf( "created Tag named %s, ", m_tag.c_str() );
656  m_current = new Tag( m_current, m_tag );
657  }
658 
659  if( m_haveTagPrefix )
660  {
661 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
662  m_current->setPrefix( m_tagPrefix );
663  m_haveTagPrefix = false;
664  }
665 
666  if( m_attribs.size() )
667  {
668  m_current->setAttributes( m_attribs );
669 // printf( "added %d attributes, ", m_attribs.size() );
670  m_attribs.clear();
671  }
672 
673  if( m_xmlnss )
674  {
675 // printf( "have ns decls\n" );
676 // StringMap::const_iterator it = m_xmlnss->begin();
677 // for( ; it != m_xmlnss->end(); ++it )
678 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
679  m_current->setXmlns( m_xmlnss );
680  m_xmlnss = 0;
681  }
682 
683  m_current->setXmlns( m_xmlns );
684  m_xmlns = EmptyString;
685 
686  if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
687  {
688  streamEvent( m_root );
689  cleanup( m_deleteRoot );
690  return;
691  }
692 // else
693 // printf( "%s, ", m_root->xml().c_str() );
694 
695  if( m_root && m_root == m_current && m_tagPrefix == "stream" )
696  m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
697 
698  if( m_tag == "xml" && m_preamble == 2 )
699  cleanup();
700  }
701 
702  void Parser::addAttribute()
703  {
704  Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
705  if( m_attribIsXmlns )
706  {
707  if( !m_xmlnss )
708  m_xmlnss = new StringMap();
709 
710  (*m_xmlnss)[m_attrib] = m_value;
711  attr->setPrefix( XMLNS );
712  }
713  else
714  {
715 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
716  if( !m_attribPrefix.empty() )
717  attr->setPrefix( m_attribPrefix );
718  if( m_attrib == XMLNS )
719  m_xmlns = m_value;
720  }
721  m_attribs.push_back( attr );
722  m_attrib = EmptyString;
723  m_value = EmptyString;
724  m_attribPrefix = EmptyString;
725  m_haveAttribPrefix = false;
726  m_attribIsXmlns = false;
727  }
728 
729  void Parser::addCData()
730  {
731  if( m_current && !m_cdata.empty() )
732  {
733  m_current->addCData( m_cdata );
734 // printf( "added cdata %s to %s: %s\n",
735 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
736  m_cdata = EmptyString;
737  }
738  }
739 
740  bool Parser::closeTag()
741  {
742 // printf( "about to close, " );
743 
744  if( m_tag == "stream" && m_tagPrefix == "stream" )
745  return true;
746 
747  if( !m_current || m_current->name() != m_tag
748  || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
749  {
750 // printf( "current xml: %s\n", m_current->xml().c_str() );
751 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
752 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
753  return false;
754  }
755 
756 // printf( "m_current: %s, ", m_current->name().c_str() );
757 // printf( "m_tag: %s, ", m_tag.c_str() );
758 
759  m_tagPrefix = EmptyString;
760  m_haveTagPrefix = false;
761 
762  if( m_current->parent() )
763  m_current = m_current->parent();
764  else
765  {
766 // printf( "pushing upstream\n" );
767  streamEvent( m_root );
768  cleanup( m_deleteRoot );
769  }
770 
771  return true;
772  }
773 
774  void Parser::cleanup( bool deleteRoot )
775  {
776  if( deleteRoot )
777  delete m_root;
778  m_root = 0;
779  m_current = 0;
780  delete m_xmlnss;
781  m_xmlnss = 0;
782  m_cdata = EmptyString;
783  m_tag = EmptyString;
784  m_attrib = EmptyString;
785  m_attribPrefix = EmptyString;
786  m_tagPrefix = EmptyString;
787  m_haveAttribPrefix = false;
788  m_haveTagPrefix = false;
789  m_value = EmptyString;
790  m_xmlns = EmptyString;
791  util::clearList( m_attribs );
792  m_attribs.clear();
793  m_state = Initial;
794  m_preamble = 0;
795  }
796 
797  bool Parser::isWhitespace( unsigned char c )
798  {
799  return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
800  }
801 
802  void Parser::streamEvent( Tag* tag )
803  {
804  if( m_tagHandler )
805  m_tagHandler->handleTag( tag );
806  }
807 
808 }
Parser(TagHandler *ph, bool deleteRoot=true)
Definition: parser.cpp:22
void cleanup(bool deleteRoot=true)
Definition: parser.cpp:774
int feed(std::string &data)
Definition: parser.cpp:161
virtual ~Parser()
Definition: parser.cpp:29
A virtual interface which can be reimplemented to receive non-XMPP Core stanzas.
Definition: taghandler.h:33
virtual void handleTag(Tag *tag)=0
This is an abstraction of an XML element.
Definition: tag.h:47
bool setPrefix(const std::string &prefix)
Definition: tag.cpp:565
const std::string & prefix() const
Definition: tag.h:249
const std::string xmlns() const
Definition: tag.cpp:543
Tag * parent() const
Definition: tag.h:526
bool addCData(const std::string &cdata)
Definition: tag.cpp:481
const std::string & name() const
Definition: tag.h:394
void setAttributes(const AttributeList &attributes)
Definition: tag.cpp:409
bool setXmlns(const std::string &xmlns, const std::string &prefix=EmptyString)
Definition: tag.cpp:522
void clearList(std::list< T * > &L)
Definition: util.h:152
The namespace for the gloox library.
Definition: adhoc.cpp:28
const std::string XMLNS_STREAM
Definition: gloox.cpp:84
const std::string EmptyString
Definition: gloox.cpp:124
const std::string XMLNS
Definition: gloox.cpp:122
std::map< std::string, std::string > StringMap
Definition: gloox.h:1261