00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include "gloox.h"
00014 #include "util.h"
00015 #include "parser.h"
00016
00017 #include <cstdlib>
00018
00019 namespace gloox
00020 {
00021
00022 Parser::Parser( TagHandler* ph )
00023 : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
00024 m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
00025 m_attribIsXmlns( false )
00026 {
00027 }
00028
00029 Parser::~Parser()
00030 {
00031 delete m_root;
00032 delete m_xmlnss;
00033 }
00034
00035 Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
00036 {
00037 std::string::size_type p = data.find( ';', pos );
00038 std::string::size_type diff = p - pos;
00039
00040 if( p == std::string::npos )
00041 {
00042 m_backBuffer = data.substr( pos );
00043 return DecodeInsufficient;
00044 }
00045
00046 if( diff < 3 || diff > 9 )
00047 return DecodeInvalid;
00048
00049 std::string rep;
00050 switch( data[pos + 1] )
00051 {
00052 case '#':
00053 {
00054 int base = 10;
00055 int idx = 2;
00056
00057 if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
00058 {
00059 base = 16;
00060 idx = 3;
00061 }
00062
00063 char* end;
00064 const long int val = std::strtol( data.data() + pos + idx, &end, base );
00065 if( *end != ';' || val < 0 )
00066 return DecodeInvalid;
00067
00068 if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
00069 {
00070 rep += char( val );
00071 }
00072 else if( val >= 0x80 && val <= 0x7FF )
00073 {
00074 rep += char( 192 + ( val >> 6 ) );
00075 rep += char( 128 + ( val % 64 ) );
00076 }
00077 else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
00078 {
00079 rep += char( 224 + ( val >> 12 ) );
00080 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00081 rep += char( 128 + ( val % 64 ) );
00082 }
00083 else if( val >= 0x100000 && val < 0x10FFFF )
00084 {
00085 rep += char( 240 + ( val >> 18 ) );
00086 rep += char( 128 + ( ( val >> 12 ) % 64 ) );
00087 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00088 rep += char( 128 + ( val % 64 ) );
00089 }
00090 else
00091 return DecodeInvalid;
00092 }
00093 break;
00094 case 'l':
00095 if( diff == 3 && data[pos + 2] == 't' )
00096 rep += '<';
00097 else
00098 return DecodeInvalid;
00099 break;
00100 case 'g':
00101 if( diff == 3 && data[pos + 2] == 't' )
00102 rep += '>';
00103 else
00104 return DecodeInvalid;
00105 break;
00106 case 'a':
00107 if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
00108 rep += '\'';
00109 else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
00110 rep += '&';
00111 else
00112 return DecodeInvalid;
00113 break;
00114 case 'q':
00115 if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
00116 rep += '"';
00117 else
00118 return DecodeInvalid;
00119 break;
00120 default:
00121 return DecodeInvalid;
00122 }
00123
00124 switch( m_state )
00125 {
00126 case TagInside:
00127 m_cdata += rep;
00128 break;
00129 case TagAttributeValue:
00130 m_value += rep;
00131 break;
00132 default:
00133 break;
00134 }
00135 pos += diff;
00136 return DecodeValid;
00137 }
00138
00139 Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
00140 const std::string& needle )
00141 {
00142 if( pos + needle.length() <= data.length() )
00143 {
00144 if( !data.compare( pos, needle.length(), needle ) )
00145 {
00146 pos += needle.length() - 1;
00147 return ForwardFound;
00148 }
00149 else
00150 {
00151 return ForwardNotFound;
00152 }
00153 }
00154 else
00155 {
00156 m_backBuffer = data.substr( pos );
00157 return ForwardInsufficientSize;
00158 }
00159 }
00160
00161 int Parser::feed( std::string& data )
00162 {
00163 if( !m_backBuffer.empty() )
00164 {
00165 data.insert( 0, m_backBuffer );
00166 m_backBuffer = EmptyString;
00167 }
00168
00169 std::string::size_type count = data.length();
00170 for( std::string::size_type i = 0 ; i < count; ++i )
00171 {
00172 const unsigned char c = data[i];
00173
00174
00175 if( !isValid( c ) )
00176 {
00177 cleanup();
00178 return i;
00179 }
00180
00181 switch( m_state )
00182 {
00183 case Initial:
00184
00185 m_tag = EmptyString;
00186 if( isWhitespace( c ) )
00187 break;
00188
00189 switch( c )
00190 {
00191 case '<':
00192 m_state = TagOpening;
00193 break;
00194 case '>':
00195 default:
00196 if( m_current )
00197 {
00198 m_cdata += c;
00199 m_state = TagInside;
00200 }
00201 break;
00202 }
00203 break;
00204 case TagOpening:
00205
00206 if( isWhitespace( c ) )
00207 break;
00208
00209 switch( c )
00210 {
00211 case '<':
00212 case '>':
00213 case '&':
00214 cleanup();
00215 return i;
00216 break;
00217 case '/':
00218 m_state = TagClosingSlash;
00219 break;
00220 case '?':
00221 m_state = TagNameCollect;
00222 m_preamble = 1;
00223 break;
00224 case '!':
00225 switch( forwardScan( i, data, "![CDATA[" ) )
00226 {
00227 case ForwardFound:
00228 m_state = TagCDATASection;
00229 break;
00230 case ForwardNotFound:
00231 cleanup();
00232 return i;
00233 case ForwardInsufficientSize:
00234 return -1;
00235 }
00236 break;
00237 default:
00238 m_tag += c;
00239 m_state = TagNameCollect;
00240 break;
00241 }
00242 break;
00243 case TagCDATASection:
00244 switch( c )
00245 {
00246 case ']':
00247 switch( forwardScan( i, data, "]]>" ) )
00248 {
00249 case ForwardFound:
00250 m_state = TagInside;
00251 break;
00252 case ForwardNotFound:
00253 m_cdata += c;
00254 break;
00255 case ForwardInsufficientSize:
00256 return -1;
00257 }
00258 break;
00259 default:
00260 m_cdata += c;
00261 break;
00262 }
00263 break;
00264 case TagNameCollect:
00265
00266 if( isWhitespace( c ) )
00267 {
00268 m_state = TagNameComplete;
00269 break;
00270 }
00271
00272 switch( c )
00273 {
00274 case '<':
00275 case '?':
00276 case '!':
00277 case '&':
00278 cleanup();
00279 return i;
00280 break;
00281 case '/':
00282 m_state = TagOpeningSlash;
00283 break;
00284 case '>':
00285 addTag();
00286 m_state = TagInside;
00287 break;
00288 case ':':
00289 if( !m_haveTagPrefix )
00290 {
00291 m_haveTagPrefix = true;
00292 m_tagPrefix = m_tag;
00293 m_tag = EmptyString;
00294 }
00295 else
00296 {
00297 cleanup();
00298 return i;
00299 }
00300 break;
00301 default:
00302 m_tag += c;
00303 break;
00304 }
00305 break;
00306 case TagInside:
00307
00308 m_tag = EmptyString;
00309 switch( c )
00310 {
00311 case '<':
00312 addCData();
00313 m_state = TagOpening;
00314 break;
00315 case '&':
00316
00317 switch( decode( i, data ) )
00318 {
00319 case DecodeValid:
00320 break;
00321 case DecodeInvalid:
00322 cleanup();
00323 return i;
00324 case DecodeInsufficient:
00325 return -1;
00326 }
00327 break;
00328 default:
00329 m_cdata += c;
00330 break;
00331 }
00332 break;
00333 case TagOpeningSlash:
00334
00335 if( isWhitespace( c ) )
00336 break;
00337
00338 if( c == '>' )
00339 {
00340 addTag();
00341 if( !closeTag() )
00342 {
00343
00344 cleanup();
00345 return i;
00346 }
00347
00348 m_state = Initial;
00349 }
00350 else
00351 {
00352 cleanup();
00353 return i;
00354 }
00355 break;
00356 case TagClosingSlash:
00357
00358 if( isWhitespace( c ) )
00359 break;
00360
00361 switch( c )
00362 {
00363 case '>':
00364 case '<':
00365 case '/':
00366 cleanup();
00367 return i;
00368 break;
00369 default:
00370 m_tag += c;
00371 m_state = TagClosing;
00372 break;
00373 }
00374 break;
00375 case TagClosing:
00376
00377 switch( c )
00378 {
00379 case '<':
00380 case '/':
00381 case '!':
00382 case '?':
00383 case '&':
00384 cleanup();
00385 return i;
00386 break;
00387 case ':':
00388 if( !m_haveTagPrefix )
00389 {
00390 m_haveTagPrefix = true;
00391 m_tagPrefix = m_tag;
00392 m_tag = EmptyString;
00393 }
00394 else
00395 {
00396 cleanup();
00397 return i;
00398 }
00399 break;
00400 case '>':
00401 if( !closeTag() )
00402 {
00403
00404 cleanup();
00405 return i;
00406 }
00407 m_state = Initial;
00408 break;
00409 default:
00410 m_tag += c;
00411 break;
00412 }
00413 break;
00414 case TagNameComplete:
00415
00416 if( isWhitespace( c ) )
00417 break;
00418
00419 switch( c )
00420 {
00421 case '<':
00422 case '!':
00423 case '&':
00424 cleanup();
00425 return i;
00426 break;
00427 case '/':
00428 m_state = TagOpeningSlash;
00429 break;
00430 case '>':
00431 if( m_preamble == 1 )
00432 {
00433 cleanup();
00434 return i;
00435 }
00436 m_state = TagInside;
00437 addTag();
00438 break;
00439 case '?':
00440 if( m_preamble == 1 )
00441 m_preamble = 2;
00442 else
00443 {
00444 cleanup();
00445 return i;
00446 }
00447 break;
00448 default:
00449 m_attrib += c;
00450 m_state = TagAttribute;
00451 break;
00452 }
00453 break;
00454 case TagAttribute:
00455
00456 if( isWhitespace( c ) )
00457 {
00458 m_state = TagAttributeComplete;
00459 break;
00460 }
00461
00462 switch( c )
00463 {
00464 case '<':
00465 case '/':
00466 case '>':
00467 case '?':
00468 case '!':
00469 case '&':
00470 cleanup();
00471 return i;
00472 break;
00473 case '=':
00474 m_state = TagAttributeEqual;
00475 break;
00476 case ':':
00477 if( !m_haveAttribPrefix && m_attrib != XMLNS )
00478 {
00479 m_haveAttribPrefix = true;
00480 m_attribPrefix = m_attrib;
00481 m_attrib = EmptyString;
00482 }
00483 else if( m_attrib == XMLNS )
00484 {
00485 m_attribIsXmlns = true;
00486 m_attrib = EmptyString;
00487 }
00488 else
00489 {
00490 cleanup();
00491 return i;
00492 }
00493 break;
00494 default:
00495 m_attrib += c;
00496 }
00497 break;
00498 case TagAttributeComplete:
00499
00500 if( isWhitespace( c ) )
00501 break;
00502
00503 switch( c )
00504 {
00505 case '=':
00506 m_state = TagAttributeEqual;
00507 break;
00508 default:
00509 cleanup();
00510 return i;
00511 break;
00512 }
00513 break;
00514 case TagAttributeEqual:
00515
00516 if( isWhitespace( c ) )
00517 break;
00518
00519 switch( c )
00520 {
00521 case '"':
00522 m_quote = true;
00523 case '\'':
00524 m_state = TagAttributeValue;
00525 break;
00526 default:
00527 cleanup();
00528 return i;
00529 break;
00530 }
00531 break;
00532 case TagAttributeValue:
00533
00534 switch( c )
00535 {
00536 case '<':
00537 cleanup();
00538 return i;
00539 break;
00540 case '\'':
00541 if( m_quote )
00542 {
00543 m_value += c;
00544 break;
00545 }
00546 case '"':
00547 addAttribute();
00548 m_state = TagNameAlmostComplete;
00549 m_quote = false;
00550 break;
00551 case '&':
00552
00553 switch( decode( i, data ) )
00554 {
00555 case DecodeValid:
00556 break;
00557 case DecodeInvalid:
00558 cleanup();
00559 return i;
00560 case DecodeInsufficient:
00561 return -1;
00562 }
00563 break;
00564 case '>':
00565 default:
00566 m_value += c;
00567 }
00568 break;
00569 case TagNameAlmostComplete:
00570
00571 if( isWhitespace( c ) )
00572 {
00573 m_state = TagNameComplete;
00574 break;
00575 }
00576
00577 switch( c )
00578 {
00579 case '/':
00580 m_state = TagOpeningSlash;
00581 break;
00582 case '>':
00583 if( m_preamble == 1 )
00584 {
00585 cleanup();
00586 return i;
00587 }
00588 m_state = TagInside;
00589 addTag();
00590 break;
00591 case '?':
00592 if( m_preamble == 1 )
00593 m_preamble = 2;
00594 else
00595 {
00596 cleanup();
00597 return i;
00598 }
00599 break;
00600 default:
00601 cleanup();
00602 return i;
00603 break;
00604 }
00605 break;
00606 default:
00607
00608 break;
00609 }
00610
00611 }
00612
00613 return -1;
00614 }
00615
00616 void Parser::addTag()
00617 {
00618 if( !m_root )
00619 {
00620
00621 m_root = new Tag( m_tag );
00622 m_current = m_root;
00623 }
00624 else
00625 {
00626
00627 m_current = new Tag( m_current, m_tag );
00628 }
00629
00630 if( m_haveTagPrefix )
00631 {
00632
00633 m_current->setPrefix( m_tagPrefix );
00634 m_haveTagPrefix = false;
00635 }
00636
00637 if( m_attribs.size() )
00638 {
00639 m_current->setAttributes( m_attribs );
00640
00641 m_attribs.clear();
00642 }
00643
00644 if( m_xmlnss )
00645 {
00646
00647
00648
00649
00650 m_current->setXmlns( m_xmlnss );
00651 m_xmlnss = 0;
00652 }
00653
00654 m_current->setXmlns( m_xmlns );
00655 m_xmlns = EmptyString;
00656
00657 if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
00658 {
00659 streamEvent( m_root );
00660 cleanup();
00661 return;
00662 }
00663
00664
00665
00666 if( m_root && m_root == m_current && m_tagPrefix == "stream" )
00667 m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
00668
00669 if( m_tag == "xml" && m_preamble == 2 )
00670 cleanup();
00671 }
00672
00673 void Parser::addAttribute()
00674 {
00675 Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
00676 if( m_attribIsXmlns )
00677 {
00678 if( !m_xmlnss )
00679 m_xmlnss = new StringMap();
00680
00681 (*m_xmlnss)[m_attrib] = m_value;
00682 attr->setPrefix( XMLNS );
00683 }
00684 else
00685 {
00686
00687 if( !m_attribPrefix.empty() )
00688 attr->setPrefix( m_attribPrefix );
00689 if( m_attrib == XMLNS )
00690 m_xmlns = m_value;
00691 }
00692 m_attribs.push_back( attr );
00693 m_attrib = EmptyString;
00694 m_value = EmptyString;
00695 m_attribPrefix = EmptyString;
00696 m_haveAttribPrefix = false;
00697 m_attribIsXmlns = false;
00698 }
00699
00700 void Parser::addCData()
00701 {
00702 if( m_current && !m_cdata.empty() )
00703 {
00704 m_current->addCData( m_cdata );
00705
00706
00707 m_cdata = EmptyString;
00708 }
00709 }
00710
00711 bool Parser::closeTag()
00712 {
00713
00714
00715 if( m_tag == "stream" && m_tagPrefix == "stream" )
00716 return true;
00717
00718 if( !m_current || m_current->name() != m_tag
00719 || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
00720 {
00721
00722
00723
00724 return false;
00725 }
00726
00727
00728
00729
00730 m_tagPrefix = EmptyString;
00731 m_haveTagPrefix = false;
00732
00733 if( m_current->parent() )
00734 m_current = m_current->parent();
00735 else
00736 {
00737
00738 streamEvent( m_root );
00739 cleanup();
00740 }
00741
00742 return true;
00743 }
00744
00745 void Parser::cleanup()
00746 {
00747 delete m_root;
00748 m_root = 0;
00749 m_current = 0;
00750 delete m_xmlnss;
00751 m_xmlnss = 0;
00752 m_cdata = EmptyString;
00753 m_tag = EmptyString;
00754 m_attrib = EmptyString;
00755 m_attribPrefix = EmptyString;
00756 m_tagPrefix = EmptyString;
00757 m_haveAttribPrefix = false;
00758 m_haveTagPrefix = false;
00759 m_value = EmptyString;
00760 m_xmlns = EmptyString;
00761
00762
00763 Tag::AttributeList::iterator it = m_attribs.begin();
00764 Tag::AttributeList::iterator it2;
00765 while( it != m_attribs.end() )
00766 {
00767 it2 = it++;
00768 delete (*it2);
00769 m_attribs.erase( it2 );
00770 }
00771
00772 m_attribs.clear();
00773 m_state = Initial;
00774 m_preamble = 0;
00775 }
00776
00777 bool Parser::isValid( unsigned char c )
00778 {
00779 return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
00780 }
00781
00782 bool Parser::isWhitespace( unsigned char c )
00783 {
00784 return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
00785 }
00786
00787 void Parser::streamEvent( Tag* tag )
00788 {
00789 if( m_tagHandler )
00790 m_tagHandler->handleTag( tag );
00791 }
00792
00793 }