gloox  1.0.20
parser.cpp
1 /*
2  Copyright (c) 2004-2017 by Jakob Schröter <js@camaya.net>
3  This file is part of the gloox library. http://camaya.net/gloox
4 
5  This software is distributed under a license. The full license
6  agreement can be found in the file LICENSE in this distribution.
7  This software may not be copied, modified, sold or distributed
8  other than expressed in the named license agreement.
9 
10  This software is distributed without any warranty.
11 */
12 
13 #include "gloox.h"
14 #include "util.h"
15 #include "parser.h"
16 
17 #include <cstdlib>
18 
19 namespace gloox
20 {
21 
22  Parser::Parser( TagHandler* ph, bool deleteRoot )
23  : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24  m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
25  m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
26  {
27  }
28 
30  {
31  delete m_root;
32  delete m_xmlnss;
33  }
34 
35  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
36  {
37  std::string::size_type p = data.find( ';', pos );
38  std::string::size_type diff = p - pos;
39 
40  if( p == std::string::npos )
41  {
42  m_backBuffer = data.substr( pos );
43  return DecodeInsufficient;
44  }
45 
46  if( diff < 3 || diff > 9 )
47  return DecodeInvalid;
48 
49  std::string rep;
50  switch( data[pos + 1] )
51  {
52  case '#':
53  {
54  int base = 10;
55  int idx = 2;
56 
57  if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
58  {
59  base = 16;
60  idx = 3;
61  }
62 
63  char* end;
64  const long int val = std::strtol( data.data() + pos + idx, &end, base );
65  if( *end != ';' || val < 0 )
66  return DecodeInvalid;
67 
68  if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
69  {
70  rep += char( val );
71  }
72  else if( val >= 0x80 && val <= 0x7FF )
73  {
74  rep += char( 192 + ( val >> 6 ) );
75  rep += char( 128 + ( val % 64 ) );
76  }
77  else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
78  {
79  rep += char( 224 + ( val >> 12 ) );
80  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
81  rep += char( 128 + ( val % 64 ) );
82  }
83  else if( val >= 0x100000 && val < 0x10FFFF )
84  {
85  rep += char( 240 + ( val >> 18 ) );
86  rep += char( 128 + ( ( val >> 12 ) % 64 ) );
87  rep += char( 128 + ( ( val >> 6 ) % 64 ) );
88  rep += char( 128 + ( val % 64 ) );
89  }
90  else
91  return DecodeInvalid;
92  }
93  break;
94  case 'l':
95  if( diff == 3 && data[pos + 2] == 't' )
96  rep += '<';
97  else
98  return DecodeInvalid;
99  break;
100  case 'g':
101  if( diff == 3 && data[pos + 2] == 't' )
102  rep += '>';
103  else
104  return DecodeInvalid;
105  break;
106  case 'a':
107  if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
108  rep += '\'';
109  else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
110  rep += '&';
111  else
112  return DecodeInvalid;
113  break;
114  case 'q':
115  if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
116  rep += '"';
117  else
118  return DecodeInvalid;
119  break;
120  default:
121  return DecodeInvalid;
122  }
123 
124  switch( m_state )
125  {
126  case InterTag:
127  case TagInside:
128  m_cdata += rep;
129  break;
130  case TagAttributeValue:
131  m_value += rep;
132  break;
133  default:
134  break;
135  }
136  pos += diff;
137  return DecodeValid;
138  }
139 
140  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
141  const std::string& needle )
142  {
143  if( pos + needle.length() <= data.length() )
144  {
145  if( !data.compare( pos, needle.length(), needle ) )
146  {
147  pos += needle.length() - 1;
148  return ForwardFound;
149  }
150  else
151  {
152  return ForwardNotFound;
153  }
154  }
155  else
156  {
157  m_backBuffer = data.substr( pos );
158  return ForwardInsufficientSize;
159  }
160  }
161 
162  int Parser::feed( std::string& data )
163  {
164  if( !m_backBuffer.empty() )
165  {
166  data.insert( 0, m_backBuffer );
167  m_backBuffer = EmptyString;
168  }
169 
170  std::string::size_type count = data.length();
171  for( std::string::size_type i = 0; i < count; ++i )
172  {
173  const unsigned char c = data[i];
174 // printf( "found char: %c, ", c );
175 
176  switch( m_state )
177  {
178  case Initial:
179 // printf( "Initial: %c\n", c );
180  if( isWhitespace( c ) )
181  break;
182 
183  switch( c )
184  {
185  case '<':
186  m_state = TagOpening;
187  break;
188  default:
189  cleanup();
190  return static_cast<int>( i );
191  break;
192  }
193  break;
194  case InterTag:
195 // printf( "InterTag: %c\n", c );
196  m_tag = EmptyString;
197  if( isWhitespace( c ) )
198  {
199  m_state = TagInside;
200  if( m_current )
201  m_cdata += c;
202  break;
203  }
204 
205  switch( c )
206  {
207  case '&':
208 // printf( "InterTag, calling decode\n" );
209  switch( decode( i, data ) )
210  {
211  case DecodeValid:
212  m_state = TagInside;
213  break;
214  case DecodeInvalid:
215  cleanup();
216  return static_cast<int>( i );
217  case DecodeInsufficient:
218  return -1;
219  }
220  break;
221  case '<':
222  m_state = TagOpening;
223  break;
224  case '>':
225  default:
226  if( m_current )
227  {
228  m_cdata += c;
229  m_state = TagInside;
230  }
231  break;
232  }
233  break;
234  case TagOpening: // opening '<' has been found before
235 // printf( "TagOpening: %c\n", c );
236  if( isWhitespace( c ) )
237  break;
238 
239  switch( c )
240  {
241  case '<':
242  case '>':
243  case '&':
244  cleanup();
245  return static_cast<int>( i );
246  break;
247  case '/':
248  m_state = TagClosingSlash;
249  break;
250  case '?':
251  m_state = TagNameCollect;
252  m_preamble = 1;
253  break;
254  case '!':
255  switch( forwardScan( i, data, "![CDATA[" ) )
256  {
257  case ForwardFound:
258  m_state = TagCDATASection;
259  break;
260  case ForwardNotFound:
261  cleanup();
262  return static_cast<int>( i );
263  case ForwardInsufficientSize:
264  return -1;
265  }
266  break;
267  default:
268  m_tag += c;
269  m_state = TagNameCollect;
270  break;
271  }
272  break;
273  case TagCDATASection:
274  switch( c )
275  {
276  case ']':
277  switch( forwardScan( i, data, "]]>" ) )
278  {
279  case ForwardFound:
280  m_state = TagInside;
281  break;
282  case ForwardNotFound:
283  m_cdata += c;
284  break;
285  case ForwardInsufficientSize:
286  return -1;
287  }
288  break;
289  default:
290  m_cdata += c;
291  break;
292  }
293  break;
294  case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
295 // printf( "TagNameCollect: %c\n", c );
296  if( isWhitespace( c ) )
297  {
298  m_state = TagNameComplete;
299  break;
300  }
301 
302  switch( c )
303  {
304  case '<':
305  case '?':
306  case '!':
307  case '&':
308  cleanup();
309  return static_cast<int>( i );
310  break;
311  case '/':
312  m_state = TagOpeningSlash;
313  break;
314  case '>':
315  addTag();
316  m_state = TagInside;
317  break;
318  case ':':
319  if( !m_haveTagPrefix )
320  {
321  m_haveTagPrefix = true;
322  m_tagPrefix = m_tag;
323  m_tag = EmptyString;
324  }
325  else
326  {
327  cleanup();
328  return static_cast<int>( i );
329  }
330  break;
331  default:
332  m_tag += c;
333  break;
334  }
335  break;
336  case TagInside: // we're inside a tag, expecting a child tag or cdata
337 // printf( "TagInside: %c\n", c );
338  m_tag = EmptyString;
339  switch( c )
340  {
341  case '<':
342  addCData();
343  m_state = TagOpening;
344  break;
345  case '&':
346 // printf( "TagInside, calling decode\n" );
347  switch( decode( i, data ) )
348  {
349  case DecodeValid:
350  break;
351  case DecodeInvalid:
352  cleanup();
353  return static_cast<int>( i );
354  case DecodeInsufficient:
355  return -1;
356  }
357  break;
358  default:
359  m_cdata += c;
360  break;
361  }
362  break;
363  case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
364 // printf( "TagOpeningSlash: %c\n", c );
365  if( isWhitespace( c ) )
366  break;
367 
368  if( c == '>' )
369  {
370  addTag();
371  if( !closeTag() )
372  {
373 // printf( "noipe, here\n" );
374  cleanup();
375  return static_cast<int>( i );
376  }
377 
378  m_state = InterTag;
379  }
380  else
381  {
382  cleanup();
383  return static_cast<int>( i );
384  }
385  break;
386  case TagClosingSlash: // we have found the '/' of a closing tag
387 // printf( "TagClosingSlash: %c\n", c );
388  if( isWhitespace( c ) )
389  break;
390 
391  switch( c )
392  {
393  case '>':
394  case '<':
395  case '/':
396  cleanup();
397  return static_cast<int>( i );
398  break;
399  default:
400  m_tag += c;
401  m_state = TagClosing;
402  break;
403  }
404  break;
405  case TagClosing: // we're collecting the name of a closing tag
406 // printf( "TagClosing: %c\n", c );
407  switch( c )
408  {
409  case '<':
410  case '/':
411  case '!':
412  case '?':
413  case '&':
414  cleanup();
415  return static_cast<int>( i );
416  break;
417  case ':':
418  if( !m_haveTagPrefix )
419  {
420  m_haveTagPrefix = true;
421  m_tagPrefix = m_tag;
422  m_tag = EmptyString;
423  }
424  else
425  {
426  cleanup();
427  return static_cast<int>( i );
428  }
429  break;
430  case '>':
431  if( !closeTag() )
432  {
433 // printf( "here\n" );
434  cleanup();
435  return static_cast<int>( i );
436  }
437  m_state = InterTag;
438  break;
439  default:
440  m_tag += c;
441  break;
442  }
443  break;
444  case TagNameComplete: // a tag name is complete, expect tag close or attribs
445 // printf( "TagNameComplete: %c\n", c );
446  if( isWhitespace( c ) )
447  break;
448 
449  switch( c )
450  {
451  case '<':
452  case '!':
453  case '&':
454  cleanup();
455  return static_cast<int>( i );
456  break;
457  case '/':
458  m_state = TagOpeningSlash;
459  break;
460  case '>':
461  if( m_preamble == 1 )
462  {
463  cleanup();
464  return static_cast<int>( i );
465  }
466  m_state = TagInside;
467  addTag();
468  break;
469  case '?':
470  if( m_preamble == 1 )
471  m_preamble = 2;
472  else
473  {
474  cleanup();
475  return static_cast<int>( i );
476  }
477  break;
478  default:
479  m_attrib += c;
480  m_state = TagAttribute;
481  break;
482  }
483  break;
484  case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
485 // printf( "TagAttribute: %c\n", c );
486  if( isWhitespace( c ) )
487  {
488  m_state = TagAttributeComplete;
489  break;
490  }
491 
492  switch( c )
493  {
494  case '<':
495  case '/':
496  case '>':
497  case '?':
498  case '!':
499  case '&':
500  cleanup();
501  return static_cast<int>( i );
502  break;
503  case '=':
504  m_state = TagAttributeEqual;
505  break;
506  case ':':
507  if( !m_haveAttribPrefix && m_attrib != XMLNS )
508  {
509  m_haveAttribPrefix = true;
510  m_attribPrefix = m_attrib;
511  m_attrib = EmptyString;
512  }
513  else if( m_attrib == XMLNS )
514  {
515  m_attribIsXmlns = true;
516  m_attrib = EmptyString;
517  }
518  else
519  {
520  cleanup();
521  return static_cast<int>( i );
522  }
523  break;
524  default:
525  m_attrib += c;
526  }
527  break;
528  case TagAttributeComplete: // we're expecting an equals sign or ws
529 // printf( "TagAttributeComplete: %c\n", c );
530  if( isWhitespace( c ) )
531  break;
532 
533  switch( c )
534  {
535  case '=':
536  m_state = TagAttributeEqual;
537  break;
538  default:
539  cleanup();
540  return static_cast<int>( i );
541  break;
542  }
543  break;
544  case TagAttributeEqual: // we have found an equals sign
545 // printf( "TagAttributeEqual: %c\n", c );
546  if( isWhitespace( c ) )
547  break;
548 
549  switch( c )
550  {
551  case '"':
552  m_quote = true;
553  case '\'':
554  m_state = TagAttributeValue;
555  break;
556  default:
557  cleanup();
558  return static_cast<int>( i );
559  break;
560  }
561  break;
562  case TagAttributeValue: // we're expecting value data
563 // printf( "TagValue: %c\n", c );
564  switch( c )
565  {
566  case '<':
567  cleanup();
568  return static_cast<int>( i );
569  break;
570  case '\'':
571  if( m_quote )
572  {
573  m_value += c;
574  break;
575  }
576  case '"':
577  addAttribute();
578  m_state = TagNameAlmostComplete;
579  m_quote = false;
580  break;
581  case '&':
582 // printf( "TagAttributeValue, calling decode\n" );
583  switch( decode( i, data ) )
584  {
585  case DecodeValid:
586  break;
587  case DecodeInvalid:
588  cleanup();
589  return static_cast<int>( i );
590  case DecodeInsufficient:
591  return -1;
592  }
593  break;
594  case '>':
595  default:
596  m_value += c;
597  }
598  break;
599  case TagNameAlmostComplete:
600 // printf( "TagAttributeEqual: %c\n", c );
601  if( isWhitespace( c ) )
602  {
603  m_state = TagNameComplete;
604  break;
605  }
606 
607  switch( c )
608  {
609  case '/':
610  m_state = TagOpeningSlash;
611  break;
612  case '>':
613  if( m_preamble == 1 )
614  {
615  cleanup();
616  return static_cast<int>( i );
617  }
618  m_state = TagInside;
619  addTag();
620  break;
621  case '?':
622  if( m_preamble == 1 )
623  m_preamble = 2;
624  else
625  {
626  cleanup();
627  return static_cast<int>( i );
628  }
629  break;
630  default:
631  cleanup();
632  return static_cast<int>( i );
633  break;
634  }
635  break;
636  default:
637 // printf( "default action!?\n" );
638  break;
639  }
640 // printf( "parser state: %d\n", m_state );
641  }
642 
643  return -1;
644  }
645 
646  void Parser::addTag()
647  {
648  if( !m_root )
649  {
650 // printf( "created Tag named %s, ", m_tag.c_str() );
651  m_root = new Tag( m_tag );
652  m_current = m_root;
653  }
654  else
655  {
656 // printf( "created Tag named %s, ", m_tag.c_str() );
657  m_current = new Tag( m_current, m_tag );
658  }
659 
660  if( m_haveTagPrefix )
661  {
662 // printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
663  m_current->setPrefix( m_tagPrefix );
664  m_haveTagPrefix = false;
665  }
666 
667  if( m_attribs.size() )
668  {
669  m_current->setAttributes( m_attribs );
670 // printf( "added %d attributes, ", m_attribs.size() );
671  m_attribs.clear();
672  }
673 
674  if( m_xmlnss )
675  {
676 // printf( "have ns decls\n" );
677 // StringMap::const_iterator it = m_xmlnss->begin();
678 // for( ; it != m_xmlnss->end(); ++it )
679 // printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
680  m_current->setXmlns( m_xmlnss );
681  m_xmlnss = 0;
682  }
683 
684  m_current->setXmlns( m_xmlns );
685  m_xmlns = EmptyString;
686 
687  if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
688  {
689  streamEvent( m_root );
690  cleanup( m_deleteRoot );
691  return;
692  }
693 // else
694 // printf( "%s, ", m_root->xml().c_str() );
695 
696  if( m_root && m_root == m_current && m_tagPrefix == "stream" )
697  m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
698 
699  if( m_tag == "xml" && m_preamble == 2 )
700  cleanup();
701  }
702 
703  void Parser::addAttribute()
704  {
705  Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
706  if( m_attribIsXmlns )
707  {
708  if( !m_xmlnss )
709  m_xmlnss = new StringMap();
710 
711  (*m_xmlnss)[m_attrib] = m_value;
712  attr->setPrefix( XMLNS );
713  }
714  else
715  {
716 // printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
717  if( !m_attribPrefix.empty() )
718  attr->setPrefix( m_attribPrefix );
719  if( m_attrib == XMLNS )
720  m_xmlns = m_value;
721  }
722  m_attribs.push_back( attr );
723  m_attrib = EmptyString;
724  m_value = EmptyString;
725  m_attribPrefix = EmptyString;
726  m_haveAttribPrefix = false;
727  m_attribIsXmlns = false;
728  }
729 
730  void Parser::addCData()
731  {
732  if( m_current && !m_cdata.empty() )
733  {
734  m_current->addCData( m_cdata );
735 // printf( "added cdata %s to %s: %s\n",
736 // m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
737  m_cdata = EmptyString;
738  }
739  }
740 
741  bool Parser::closeTag()
742  {
743 // printf( "about to close, " );
744 
745  if( m_tag == "stream" && m_tagPrefix == "stream" )
746  return true;
747 
748  if( !m_current || m_current->name() != m_tag
749  || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
750  {
751 // printf( "current xml: %s\n", m_current->xml().c_str() );
752 // printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
753 // printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
754  return false;
755  }
756 
757 // printf( "m_current: %s, ", m_current->name().c_str() );
758 // printf( "m_tag: %s, ", m_tag.c_str() );
759 
760  m_tagPrefix = EmptyString;
761  m_haveTagPrefix = false;
762 
763  if( m_current->parent() )
764  m_current = m_current->parent();
765  else
766  {
767 // printf( "pushing upstream\n" );
768  streamEvent( m_root );
769  cleanup( m_deleteRoot );
770  }
771 
772  return true;
773  }
774 
775  void Parser::cleanup( bool deleteRoot )
776  {
777  if( deleteRoot )
778  delete m_root;
779  m_root = 0;
780  m_current = 0;
781  delete m_xmlnss;
782  m_xmlnss = 0;
783  m_cdata = EmptyString;
784  m_tag = EmptyString;
785  m_attrib = EmptyString;
786  m_attribPrefix = EmptyString;
787  m_tagPrefix = EmptyString;
788  m_haveAttribPrefix = false;
789  m_haveTagPrefix = false;
790  m_value = EmptyString;
791  m_xmlns = EmptyString;
792  util::clearList( m_attribs );
793  m_attribs.clear();
794  m_state = Initial;
795  m_preamble = 0;
796  }
797 
798  bool Parser::isWhitespace( unsigned char c )
799  {
800  return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
801  }
802 
803  void Parser::streamEvent( Tag* tag )
804  {
805  if( m_tagHandler )
806  m_tagHandler->handleTag( tag );
807  }
808 
809 }
bool setXmlns(const std::string &xmlns, const std::string &prefix=EmptyString)
Definition: tag.cpp:522
const std::string XMLNS
Definition: gloox.cpp:122
const std::string xmlns() const
Definition: tag.cpp:543
void clearList(std::list< T * > &L)
Definition: util.h:152
bool setPrefix(const std::string &prefix)
Definition: tag.cpp:565
Parser(TagHandler *ph, bool deleteRoot=true)
Definition: parser.cpp:22
void setAttributes(const AttributeList &attributes)
Definition: tag.cpp:409
bool setPrefix(const std::string &prefix)
Definition: tag.cpp:86
Tag * parent() const
Definition: tag.h:526
const std::string & name() const
Definition: tag.h:394
virtual ~Parser()
Definition: parser.cpp:29
const std::string & prefix() const
Definition: tag.h:249
The namespace for the gloox library.
Definition: adhoc.cpp:27
A virtual interface which can be reimplemented to receive non-XMPP Core stanzas.
Definition: taghandler.h:32
bool addCData(const std::string &cdata)
Definition: tag.cpp:481
std::map< std::string, std::string > StringMap
Definition: gloox.h:1261
virtual void handleTag(Tag *tag)=0
void cleanup(bool deleteRoot=true)
Definition: parser.cpp:775
const std::string XMLNS_STREAM
Definition: gloox.cpp:84
const std::string EmptyString
Definition: gloox.cpp:124
This is an abstraction of an XML element.
Definition: tag.h:46
int feed(std::string &data)
Definition: parser.cpp:162