Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | Related Pages

parser.cpp

00001 /*
00002   Copyright (c) 2004-2008 by Jakob Schroeter <js@camaya.net>
00003   This file is part of the gloox library. http://camaya.net/gloox
00004 
00005   This software is distributed under a license. The full license
00006   agreement can be found in the file LICENSE in this distribution.
00007   This software may not be copied, modified, sold or distributed
00008   other than expressed in the named license agreement.
00009 
00010   This software is distributed without any warranty.
00011 */
00012 
00013 #include "gloox.h"
00014 #include "util.h"
00015 #include "parser.h"
00016 
00017 #include <cstdlib>
00018 
00019 namespace gloox
00020 {
00021 
00022   Parser::Parser( TagHandler* ph )
00023     : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
00024       m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
00025       m_attribIsXmlns( false )
00026   {
00027   }
00028 
00029   Parser::~Parser()
00030   {
00031     delete m_root;
00032     delete m_xmlnss;
00033   }
00034 
00035   Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
00036   {
00037     std::string::size_type p = data.find( ';', pos );
00038     std::string::size_type diff = p - pos;
00039 
00040     if( p == std::string::npos )
00041     {
00042       m_backBuffer = data.substr( pos );
00043       return DecodeInsufficient;
00044     }
00045 
00046     if( diff < 3 || diff > 9 )
00047       return DecodeInvalid;
00048 
00049     std::string rep;
00050     switch( data[pos + 1] )
00051     {
00052       case '#':
00053         {
00054           int base = 10;
00055           int idx = 2;
00056 
00057           if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
00058           {
00059             base = 16;
00060             idx = 3;
00061           }
00062 
00063           char* end;
00064           const long int val = std::strtol( data.data() + pos + idx, &end, base );
00065           if( *end != ';' || val < 0 )
00066             return DecodeInvalid;
00067 
00068           if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
00069           {
00070             rep += char( val );
00071           }
00072           else if( val >= 0x80 && val <= 0x7FF )
00073           {
00074             rep += char( 192 + ( val >> 6 ) );
00075             rep += char( 128 + ( val % 64 ) );
00076           }
00077           else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
00078           {
00079             rep += char( 224 + ( val >> 12 ) );
00080             rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00081             rep += char( 128 + ( val % 64 ) );
00082           }
00083           else if( val >= 0x100000 && val < 0x10FFFF )
00084           {
00085             rep += char( 240 + ( val >> 18 ) );
00086             rep += char( 128 + ( ( val >> 12 ) % 64 ) );
00087             rep += char( 128 + ( ( val >> 6 ) % 64 ) );
00088             rep += char( 128 + ( val % 64 ) );
00089           }
00090           else
00091             return DecodeInvalid;
00092         }
00093         break;
00094       case 'l':
00095         if( diff == 3 && data[pos + 2] == 't' )
00096           rep += '<';
00097         else
00098           return DecodeInvalid;
00099         break;
00100       case 'g':
00101         if( diff == 3 && data[pos + 2] == 't' )
00102           rep += '>';
00103         else
00104           return DecodeInvalid;
00105         break;
00106       case 'a':
00107         if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
00108           rep += '\'';
00109         else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
00110           rep += '&';
00111         else
00112           return DecodeInvalid;
00113         break;
00114       case 'q':
00115         if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
00116           rep += '"';
00117         else
00118           return DecodeInvalid;
00119         break;
00120       default:
00121         return DecodeInvalid;
00122     }
00123 
00124     switch( m_state )
00125     {
00126       case TagInside:
00127         m_cdata += rep;
00128         break;
00129       case TagAttributeValue:
00130         m_value += rep;
00131         break;
00132       default:
00133         break;
00134     }
00135     pos += diff;
00136     return DecodeValid;
00137   }
00138 
00139   Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
00140                                                 const std::string& needle )
00141   {
00142     if( pos + needle.length() <= data.length() )
00143     {
00144       if( !data.compare( pos, needle.length(), needle ) )
00145       {
00146         pos += needle.length() - 1;
00147         return ForwardFound;
00148       }
00149       else
00150       {
00151         return ForwardNotFound;
00152       }
00153     }
00154     else
00155     {
00156       m_backBuffer = data.substr( pos );
00157       return ForwardInsufficientSize;
00158     }
00159   }
00160 
00161   int Parser::feed( std::string& data )
00162   {
00163     if( !m_backBuffer.empty() )
00164     {
00165       data.insert( 0, m_backBuffer );
00166       m_backBuffer = EmptyString;
00167     }
00168 
00169     std::string::size_type count = data.length();
00170     for( std::string::size_type i = 0 ; i < count; ++i )
00171     {
00172       const unsigned char c = data[i];
00173 //       printf( "found char:   %c, ", c );
00174 
00175       if( !isValid( c ) )
00176       {
00177         cleanup();
00178         return i;
00179       }
00180 
00181       switch( m_state )
00182       {
00183         case Initial:
00184 //           printf( "Initial: %c\n", c );
00185           m_tag = EmptyString;
00186           if( isWhitespace( c ) )
00187             break;
00188 
00189           switch( c )
00190           {
00191             case '<':
00192               m_state = TagOpening;
00193               break;
00194             case '>':
00195             default:
00196               if( m_current )
00197               {
00198                 m_cdata += c;
00199                 m_state = TagInside;
00200               }
00201               break;
00202           }
00203           break;
00204         case TagOpening:               // opening '<' has been found before
00205 //           printf( "TagOpening: %c\n", c );
00206           if( isWhitespace( c ) )
00207             break;
00208 
00209           switch( c )
00210           {
00211             case '<':
00212             case '>':
00213             case '&':
00214               cleanup();
00215               return i;
00216               break;
00217             case '/':
00218               m_state = TagClosingSlash;
00219               break;
00220             case '?':
00221               m_state = TagNameCollect;
00222               m_preamble = 1;
00223               break;
00224             case '!':
00225               switch( forwardScan( i, data, "![CDATA[" ) )
00226               {
00227                 case ForwardFound:
00228                   m_state = TagCDATASection;
00229                   break;
00230                 case ForwardNotFound:
00231                   cleanup();
00232                   return i;
00233                 case ForwardInsufficientSize:
00234                   return -1;
00235               }
00236               break;
00237             default:
00238               m_tag += c;
00239               m_state = TagNameCollect;
00240               break;
00241           }
00242           break;
00243         case TagCDATASection:
00244           switch( c )
00245           {
00246             case ']':
00247               switch( forwardScan( i, data, "]]>" ) )
00248               {
00249                 case ForwardFound:
00250                   m_state = TagInside;
00251                   break;
00252                 case ForwardNotFound:
00253                   m_cdata += c;
00254                   break;
00255                 case ForwardInsufficientSize:
00256                   return -1;
00257               }
00258               break;
00259             default:
00260               m_cdata += c;
00261               break;
00262           }
00263           break;
00264         case TagNameCollect:          // we're collecting the tag's name, we have at least one octet already
00265 //           printf( "TagNameCollect: %c\n", c );
00266           if( isWhitespace( c ) )
00267           {
00268             m_state = TagNameComplete;
00269             break;
00270           }
00271 
00272           switch( c )
00273           {
00274             case '<':
00275             case '?':
00276             case '!':
00277             case '&':
00278               cleanup();
00279               return i;
00280               break;
00281             case '/':
00282               m_state = TagOpeningSlash;
00283               break;
00284             case '>':
00285               addTag();
00286               m_state = TagInside;
00287               break;
00288             case ':':
00289               if( !m_haveTagPrefix )
00290               {
00291                 m_haveTagPrefix = true;
00292                 m_tagPrefix = m_tag;
00293                 m_tag = EmptyString;
00294               }
00295               else
00296               {
00297                 cleanup();
00298                 return i;
00299               }
00300               break;
00301             default:
00302               m_tag += c;
00303               break;
00304           }
00305           break;
00306         case TagInside:                // we're inside a tag, expecting a child tag or cdata
00307 //           printf( "TagInside: %c\n", c );
00308           m_tag = EmptyString;
00309           switch( c )
00310           {
00311             case '<':
00312               addCData();
00313               m_state = TagOpening;
00314               break;
00315             case '&':
00316 //               printf( "TagInside, calling decode\n" );
00317               switch( decode( i, data ) )
00318               {
00319                 case DecodeValid:
00320                   break;
00321                 case DecodeInvalid:
00322                   cleanup();
00323                   return i;
00324                 case DecodeInsufficient:
00325                   return -1;
00326               }
00327               break;
00328             default:
00329               m_cdata += c;
00330               break;
00331           }
00332           break;
00333         case TagOpeningSlash:         // a slash in an opening tag has been found, initing close of the tag
00334 //           printf( "TagOpeningSlash: %c\n", c );
00335           if( isWhitespace( c ) )
00336             break;
00337 
00338           if( c == '>' )
00339           {
00340             addTag();
00341             if( !closeTag() )
00342             {
00343 //               printf( "noipe, here\n" );
00344               cleanup();
00345               return i;
00346             }
00347 
00348             m_state = Initial;
00349           }
00350           else
00351           {
00352             cleanup();
00353             return i;
00354           }
00355           break;
00356         case TagClosingSlash:         // we have found the '/' of a closing tag
00357 //           printf( "TagClosingSlash: %c\n", c );
00358           if( isWhitespace( c ) )
00359             break;
00360 
00361           switch( c )
00362           {
00363             case '>':
00364             case '<':
00365             case '/':
00366               cleanup();
00367               return i;
00368               break;
00369             default:
00370               m_tag += c;
00371               m_state = TagClosing;
00372               break;
00373           }
00374           break;
00375         case TagClosing:               // we're collecting the name of a closing tag
00376 //           printf( "TagClosing: %c\n", c );
00377           switch( c )
00378           {
00379             case '<':
00380             case '/':
00381             case '!':
00382             case '?':
00383             case '&':
00384               cleanup();
00385               return i;
00386               break;
00387             case ':':
00388               if( !m_haveTagPrefix )
00389               {
00390                 m_haveTagPrefix = true;
00391                 m_tagPrefix = m_tag;
00392                 m_tag = EmptyString;
00393               }
00394               else
00395               {
00396                 cleanup();
00397                 return i;
00398               }
00399               break;
00400             case '>':
00401               if( !closeTag() )
00402               {
00403 //                 printf( "here\n" );
00404                 cleanup();
00405                 return i;
00406               }
00407               m_state = Initial;
00408               break;
00409             default:
00410               m_tag += c;
00411               break;
00412           }
00413           break;
00414         case TagNameComplete:        // a tag name is complete, expect tag close or attribs
00415 //           printf( "TagNameComplete: %c\n", c );
00416           if( isWhitespace( c ) )
00417             break;
00418 
00419           switch( c )
00420           {
00421             case '<':
00422             case '!':
00423             case '&':
00424               cleanup();
00425               return i;
00426               break;
00427             case '/':
00428               m_state = TagOpeningSlash;
00429               break;
00430             case '>':
00431               if( m_preamble == 1 )
00432               {
00433                 cleanup();
00434                 return i;
00435               }
00436               m_state = TagInside;
00437               addTag();
00438               break;
00439             case '?':
00440               if( m_preamble == 1 )
00441                 m_preamble = 2;
00442               else
00443               {
00444                 cleanup();
00445                 return i;
00446               }
00447               break;
00448             default:
00449               m_attrib += c;
00450               m_state = TagAttribute;
00451               break;
00452           }
00453           break;
00454         case TagAttribute:                  // we're collecting the name of an attribute, we have at least 1 octet
00455 //           printf( "TagAttribute: %c\n", c );
00456           if( isWhitespace( c ) )
00457           {
00458             m_state = TagAttributeComplete;
00459             break;
00460           }
00461 
00462           switch( c )
00463           {
00464             case '<':
00465             case '/':
00466             case '>':
00467             case '?':
00468             case '!':
00469             case '&':
00470               cleanup();
00471               return i;
00472               break;
00473             case '=':
00474               m_state = TagAttributeEqual;
00475               break;
00476             case ':':
00477               if( !m_haveAttribPrefix && m_attrib != XMLNS )
00478               {
00479                 m_haveAttribPrefix = true;
00480                 m_attribPrefix = m_attrib;
00481                 m_attrib = EmptyString;
00482               }
00483               else if( m_attrib == XMLNS )
00484               {
00485                 m_attribIsXmlns = true;
00486                 m_attrib = EmptyString;
00487               }
00488               else
00489               {
00490                 cleanup();
00491                 return i;
00492               }
00493               break;
00494             default:
00495               m_attrib += c;
00496           }
00497           break;
00498         case TagAttributeComplete:         // we're expecting an equals sign or ws
00499 //           printf( "TagAttributeComplete: %c\n", c );
00500           if( isWhitespace( c ) )
00501             break;
00502 
00503           switch( c )
00504           {
00505             case '=':
00506               m_state = TagAttributeEqual;
00507               break;
00508             default:
00509               cleanup();
00510               return i;
00511               break;
00512           }
00513           break;
00514         case TagAttributeEqual:            // we have found an equals sign
00515 //           printf( "TagAttributeEqual: %c\n", c );
00516           if( isWhitespace( c ) )
00517             break;
00518 
00519           switch( c )
00520           {
00521             case '"':
00522               m_quote = true;
00523             case '\'':
00524               m_state = TagAttributeValue;
00525               break;
00526             default:
00527               cleanup();
00528               return i;
00529               break;
00530           }
00531           break;
00532         case TagAttributeValue:                 // we're expecting value data
00533 //           printf( "TagValue: %c\n", c );
00534           switch( c )
00535           {
00536             case '<':
00537               cleanup();
00538               return i;
00539               break;
00540             case '\'':
00541               if( m_quote )
00542               {
00543                 m_value += c;
00544                 break;
00545               }
00546             case '"':
00547               addAttribute();
00548               m_state = TagNameAlmostComplete;
00549               m_quote = false;
00550               break;
00551             case '&':
00552 //               printf( "TagAttributeValue, calling decode\n" );
00553               switch( decode( i, data ) )
00554               {
00555                 case DecodeValid:
00556                   break;
00557                 case DecodeInvalid:
00558                   cleanup();
00559                   return i;
00560                 case DecodeInsufficient:
00561                   return -1;
00562               }
00563               break;
00564             case '>':
00565             default:
00566               m_value += c;
00567           }
00568           break;
00569         case TagNameAlmostComplete:
00570 //           printf( "TagAttributeEqual: %c\n", c );
00571           if( isWhitespace( c ) )
00572           {
00573             m_state = TagNameComplete;
00574             break;
00575           }
00576 
00577           switch( c )
00578           {
00579             case '/':
00580               m_state = TagOpeningSlash;
00581               break;
00582             case '>':
00583               if( m_preamble == 1 )
00584               {
00585                 cleanup();
00586                 return i;
00587               }
00588               m_state = TagInside;
00589               addTag();
00590               break;
00591             case '?':
00592               if( m_preamble == 1 )
00593                 m_preamble = 2;
00594               else
00595               {
00596                 cleanup();
00597                 return i;
00598               }
00599               break;
00600             default:
00601               cleanup();
00602               return i;
00603               break;
00604           }
00605           break;
00606         default:
00607 //           printf( "default action!?\n" );
00608           break;
00609       }
00610 //       printf( "parser state: %d\n", m_state );
00611     }
00612 
00613     return -1;
00614   }
00615 
00616   void Parser::addTag()
00617   {
00618     if( !m_root )
00619     {
00620 //       printf( "created Tag named %s, ", m_tag.c_str() );
00621       m_root = new Tag( m_tag );
00622       m_current = m_root;
00623     }
00624     else
00625     {
00626 //       printf( "created Tag named %s, ", m_tag.c_str() );
00627       m_current = new Tag( m_current, m_tag );
00628     }
00629 
00630     if( m_haveTagPrefix )
00631     {
00632 //       printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
00633       m_current->setPrefix( m_tagPrefix );
00634       m_haveTagPrefix = false;
00635     }
00636 
00637     if( m_attribs.size() )
00638     {
00639       m_current->setAttributes( m_attribs );
00640 //       printf( "added %d attributes, ", m_attribs.size() );
00641       m_attribs.clear();
00642     }
00643 
00644     if( m_xmlnss )
00645     {
00646 //       printf( "have ns decls\n" );
00647 //       StringMap::const_iterator it = m_xmlnss->begin();
00648 //       for( ; it != m_xmlnss->end(); ++it )
00649 //         printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
00650       m_current->setXmlns( m_xmlnss );
00651       m_xmlnss = 0;
00652     }
00653 
00654     m_current->setXmlns( m_xmlns );
00655     m_xmlns = EmptyString;
00656 
00657     if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
00658     {
00659       streamEvent( m_root );
00660       cleanup();
00661       return;
00662     }
00663 //     else
00664 //       printf( "%s, ", m_root->xml().c_str() );
00665 
00666     if( m_root && m_root == m_current && m_tagPrefix == "stream" )
00667       m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
00668 
00669     if( m_tag == "xml" && m_preamble == 2 )
00670       cleanup();
00671   }
00672 
00673   void Parser::addAttribute()
00674   {
00675     Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
00676     if( m_attribIsXmlns )
00677     {
00678       if( !m_xmlnss )
00679         m_xmlnss = new StringMap();
00680 
00681       (*m_xmlnss)[m_attrib] = m_value;
00682       attr->setPrefix( XMLNS );
00683     }
00684     else
00685     {
00686 //   printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
00687       if( !m_attribPrefix.empty() )
00688         attr->setPrefix( m_attribPrefix );
00689       if( m_attrib == XMLNS )
00690         m_xmlns = m_value;
00691     }
00692     m_attribs.push_back( attr );
00693     m_attrib = EmptyString;
00694     m_value = EmptyString;
00695     m_attribPrefix = EmptyString;
00696     m_haveAttribPrefix = false;
00697     m_attribIsXmlns = false;
00698   }
00699 
00700   void Parser::addCData()
00701   {
00702     if( m_current && !m_cdata.empty() )
00703     {
00704       m_current->addCData( m_cdata );
00705 //       printf( "added cdata %s to %s: %s\n",
00706 //               m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
00707       m_cdata = EmptyString;
00708     }
00709   }
00710 
00711   bool Parser::closeTag()
00712   {
00713 //     printf( "about to close, " );
00714 
00715     if( m_tag == "stream" && m_tagPrefix == "stream" )
00716       return true;
00717 
00718     if( !m_current || m_current->name() != m_tag
00719         || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
00720     {
00721 //       printf( "current xml: %s\n", m_current->xml().c_str() );
00722 //       printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
00723 //       printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
00724       return false;
00725     }
00726 
00727 //       printf( "m_current: %s, ", m_current->name().c_str() );
00728 //       printf( "m_tag: %s, ", m_tag.c_str() );
00729 
00730     m_tagPrefix = EmptyString;
00731     m_haveTagPrefix = false;
00732 
00733     if( m_current->parent() )
00734       m_current = m_current->parent();
00735     else
00736     {
00737 //       printf( "pushing upstream\n" );
00738       streamEvent( m_root );
00739       cleanup();
00740     }
00741 
00742     return true;
00743   }
00744 
00745   void Parser::cleanup()
00746   {
00747     delete m_root;
00748     m_root = 0;
00749     m_current = 0;
00750     delete m_xmlnss;
00751     m_xmlnss = 0;
00752     m_cdata = EmptyString;
00753     m_tag = EmptyString;
00754     m_attrib = EmptyString;
00755     m_attribPrefix = EmptyString;
00756     m_tagPrefix = EmptyString;
00757     m_haveAttribPrefix = false;
00758     m_haveTagPrefix = false;
00759     m_value = EmptyString;
00760     m_xmlns = EmptyString;
00761 //     util::clearList( m_attribs );
00762     // FIXME
00763     Tag::AttributeList::iterator it = m_attribs.begin();
00764     Tag::AttributeList::iterator it2;
00765     while( it != m_attribs.end() )
00766     {
00767       it2 = it++;
00768       delete (*it2);
00769       m_attribs.erase( it2 );
00770     }
00771     // ~
00772     m_attribs.clear();
00773     m_state = Initial;
00774     m_preamble = 0;
00775   }
00776 
00777   bool Parser::isValid( unsigned char c )
00778   {
00779     return ( c != 0xc0 || c != 0xc1 || c < 0xf5 );
00780   }
00781 
00782   bool Parser::isWhitespace( unsigned char c )
00783   {
00784     return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
00785   }
00786 
00787   void Parser::streamEvent( Tag* tag )
00788   {
00789     if( m_tagHandler )
00790       m_tagHandler->handleTag( tag );
00791   }
00792 
00793 }

Generated on Mon Sep 1 09:25:10 2008 for gloox by  doxygen 1.4.1