// Copyright 1999, 2002 Robert Buff

// Contact: http://robertbuff.com/uvm

//

// This file is part of Mtg-Book.

//

// Mtg-Book is free software; you can redistribute it and/or modify

// it under the terms of the GNU General Public License as published

// by the Free Software Foundation; either version 2 of the License,

// or (at your option) any later version.

//

// Mtg-Book is distributed in the hope that it will be useful,

// but WITHOUT ANY WARRANTY; without even the implied warranty of

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

// GNU General Public License for more details.

//

// You should have received a copy of the GNU General Public License

// along with Mtg-Book; if not, write to the 

//

// Free Software Foundation, Inc.

// 59 Temple Place, Suite 330

// Boston, MA 02111-1307

// USA



#include "MtgIncl.h"

#include "MtgHtmlDoc.h"



MTG_BEGIN_NAMESPACE





//

//   i n i t

//



// Puts the object into its empty state: no source text, no literal
// buffer, and the read position rewound through reset(). Nothing is
// released here; the members are simply overwritten.
void tHtmlDoc::init()
{
        // literal scratch buffer: nothing allocated yet
    m_nLitSize = 0;
    m_sLiteral = 0;

        // no document attached
    m_sSource = 0;

        // reset() reads m_sSource, so it has to come last
    reset();
}





//

//   c l e a n u p

//



// Releases the literal buffer and the source text and sets the
// corresponding members back to their empty values. Calling it again
// on an already cleaned object is harmless. m_sCurrent and
// m_bInsideTag are left as they are.
void tHtmlDoc::cleanup()
{
        // m_sLiteral is allocated with new char[] (see getLiteral()
        // and copyFrom()), so it must be released with delete[].
        // Deleting a null pointer is a no-op; no guard is needed.
    delete[] m_sLiteral;
    m_sLiteral = 0;
    m_nLitSize = 0;

        // NOTE(review): m_sSource is produced by StrCopy(), whose
        // allocator is not visible in this file. If StrCopy() uses
        // new char[], this should become delete[] as well - confirm.
    delete m_sSource;
    m_sSource = 0;
}





//

//   e o s

//



// Reports whether the read position has reached the end of the input.
// An object without source text is always considered to be at the end.
bool tHtmlDoc::eos() const
{
    MTG_ASSERT( ! m_sSource || m_sCurrent );

    if( m_sSource == 0 )
        return true;

        // the terminating NUL marks the end of the document
    return *m_sCurrent == '\0';
}





//

//   s k i p S p a c e

//



// Advances the read position past any run of white space. Stops at
// the terminating NUL, because isspace() is false for it.
void tHtmlDoc::skipSpace()
{
        // The <ctype.h> functions are only defined for EOF and values
        // representable as unsigned char; a plain char may be negative
        // for bytes >= 0x80, so convert before classifying.
    while( isspace( static_cast<unsigned char>( *m_sCurrent ) ) )
        ++m_sCurrent;
}





//

//   s k i p C o m m e n t

//



// Skips a "<!" ... ">" declaration starting at the read position.
// Precondition (asserted): the read position is on "<!".
//
// A comment is a sequence of "-- .. --" constructs, separated by
// white space and enclosed in "<!" and ">". Strictly, the first "--"
// has to follow the "<!" immediately; this implementation is more
// lenient and tolerates white space there.
void tHtmlDoc::skipComment()
{
    MTG_ASSERT( *m_sCurrent == '<' && m_sCurrent[1] == '!' );

        // step over "<!" and any white space behind it
    m_sCurrent += 2;
    skipSpace();

        // consume every "-- .. --" section
    while( m_sCurrent[0] == '-' && m_sCurrent[1] == '-' ) {
        m_sCurrent += 2;            // opening "--"

        for( ;; ) {
            if( *m_sCurrent == '\0' )
                break;              // unterminated section: stop at end of input
            if( m_sCurrent[0] == '-' && m_sCurrent[1] == '-' ) {
                m_sCurrent += 2;    // closing "--"
                break;
            }
            ++m_sCurrent;
        }

        skipSpace();
    }

        // A ">" is expected now. If something else is found, simply
        // run forward to the next ">" (browsers are encouraged to use
        // simple heuristics to fix bad html source).
    while( *m_sCurrent != '\0' && *m_sCurrent != '>' )
        ++m_sCurrent;

        // step over the ">" unless the input ended first
    if( *m_sCurrent != '\0' )
        ++m_sCurrent;
}





//

//   s k i p D a t a

//



// Advances the read position over character data and "<!" ... ">"
// comments. On return the read position is either at the end of the
// input or on a "<" that is not followed by "!".
void tHtmlDoc::skipData()
{
    for( ;; ) {
            // run forward to the next "<" or the end of the input
        while( *m_sCurrent != '\0' && *m_sCurrent != '<' )
            ++m_sCurrent;

        if( *m_sCurrent == '\0' )
            return;

        if( m_sCurrent[1] != '!' )
            return;    // caller must check if this indicates indeed a tag

        skipComment();
    }
}





//

//   i s L i t e r a l

//



// Tells whether the character at the read position can start a
// literal as understood by scanLiteral(): a letter, a digit, an
// underscore, or a single or double quote.
bool tHtmlDoc::isLiteral() const
{
    const char c = *m_sCurrent;

        // isalnum() is only defined for values representable as
        // unsigned char; plain char may be negative for bytes >= 0x80.
    return isalnum( static_cast<unsigned char>( c ) ) != 0 ||
           c == '_' || c == '\"' || c == '\'';
}





//

//   s c a n L i t e r a l

//



// Scans one literal at the read position and advances past it and
// past any white space that follows.
//
//   sStart    receives a pointer to the first character of the
//             literal inside the source text (for a quoted string,
//             the character after the opening quote)
//   nSize     receives the number of characters in the literal
//             (quotes not counted)
//   bIgnCase  receives true for names, false for numbers and quoted
//             strings
//
// Three forms are recognized:
//   name    starts with a letter or "_", continues with letters,
//           digits, ".", "-" and "_"
//   number  a run of digits
//   string  enclosed in matching single or double quotes; a missing
//           closing quote is tolerated and ends the string at the
//           end of the input
//
// Precondition (asserted): isLiteral() holds. If it does not and
// assertions are disabled, an empty literal at the read position is
// reported instead of leaving the outputs uninitialized.
void tHtmlDoc::scanLiteral( const char *&sStart, int &nSize,
    bool &bIgnCase )
{
    MTG_ASSERT( isLiteral() );

        // defined results for every path, including the "no branch
        // matches" case
    sStart = m_sCurrent;
    nSize = 0;
    bIgnCase = false;

        // The <ctype.h> functions are only defined for values
        // representable as unsigned char, so classify the converted
        // value rather than the (possibly negative) plain char.
    const unsigned char cFirst = static_cast<unsigned char>( *m_sCurrent );

    if( isalpha( cFirst ) || cFirst == '_' ) {
        bIgnCase = true;
        while( isalnum( static_cast<unsigned char>( *m_sCurrent ) ) ||
               *m_sCurrent == '.' || *m_sCurrent == '-' ||
               *m_sCurrent == '_' ) {
            ++m_sCurrent;
            ++nSize;
        }
    }
    else
        if( isdigit( cFirst ) ) {
            while( isdigit( static_cast<unsigned char>( *m_sCurrent ) ) ) {
                ++m_sCurrent;
                ++nSize;
            }
        }
    else
        if( cFirst == '\"' || cFirst == '\'' ) {
            const char cQuote = *m_sCurrent++;

            sStart = m_sCurrent;    // literal starts behind the quote
            while( *m_sCurrent && *m_sCurrent != cQuote ) {
                ++m_sCurrent;
                ++nSize;
            }
            if( *m_sCurrent )
                ++m_sCurrent;     // be lenient with a missing quote
        }

    skipSpace();
}





//

//   g e t L i t e r a l

//



// Scans the literal at the read position (see scanLiteral()) and
// stores a NUL-terminated copy in m_sLiteral, growing that buffer if
// needed. Names are converted to lower case; numbers and quoted
// strings are copied unchanged. At most 1024 characters are copied,
// although the whole literal is consumed from the input.
void tHtmlDoc::getLiteral()
{
    const char *sStart;
    int nSize;
    bool bIgnCase;

    scanLiteral( sStart, nSize, bIgnCase );

        // the standard restricts the size of literals to 1024 bytes
    if( nSize > 1024 )
        nSize = 1024;

        // make sure buffer is large enough
    if( m_sLiteral == 0 || m_nLitSize < nSize ) {
            // Allocate first, so that a failing allocation leaves the
            // old buffer intact. The buffer is an array and has to be
            // released with delete[]; deleting a null pointer is a
            // no-op.
        char* sNew = new char[nSize + 1];

        delete[] m_sLiteral;
        m_sLiteral = sNew;
        m_nLitSize = nSize;
    }

    memcpy( m_sLiteral, sStart, nSize );
    m_sLiteral[nSize] = 0;

    if( bIgnCase ) {
            // tolower() is only defined for values representable as
            // unsigned char; plain char may be negative.
        for( int k = 0; k < nSize; ++k ) {
            m_sLiteral[k] = static_cast<char>(
                tolower( static_cast<unsigned char>( m_sLiteral[k] ) ) );
        }
    }
}





//

//   s k i p L i t e r a l

//



// Advances past the literal at the read position without storing it.
// The scan results are discarded; m_sLiteral is not modified.
void tHtmlDoc::skipLiteral()
{
        // scanLiteral() insists on output arguments; these are
        // throw-away receivers
    bool bUnusedCase;
    int nUnusedSize;
    const char *sUnusedStart;

    scanLiteral( sUnusedStart, nUnusedSize, bUnusedCase );
}





//

//   s k i p B a d T a g

//



// Recovery for a badly formed tag: discards everything up to and
// including the next ">" (or up to the end of the input) and marks
// the tag as finished. Does nothing unless m_bInsideTag is set.
void tHtmlDoc::skipBadTag()
{
    if( m_bInsideTag ) {
        for( ; *m_sCurrent != '\0'; ++m_sCurrent ) {
            if( *m_sCurrent == '>' ) {
                ++m_sCurrent;   // step over the ">" itself
                break;
            }
        }

        m_bInsideTag = false;
    }
}





//

//   s k i p T a g

//



// Skips the remainder of the tag the read position is in: literals
// and "=" signs are consumed until the closing ">" has been passed.
// Anything else makes the tag badly formed and is handled by
// skipBadTag(). Does nothing unless m_bInsideTag is set.
//
// If the input ends before a ">" is seen, the function returns with
// m_bInsideTag still set.
void tHtmlDoc::skipTag()
{
    if( ! m_bInsideTag )
        return;

    while( *m_sCurrent != '\0' ) {
        const char c = *m_sCurrent;

        if( c == '>' ) {
            ++m_sCurrent;
            m_bInsideTag = false;
            return;     // found end of tag
        }

        if( c == '=' ) {
            ++m_sCurrent;
            skipSpace();
            continue;
        }

        if( ! isLiteral() ) {
            skipBadTag();
            return;
        }

        skipLiteral();
    }
}





//

//   g e t T a g

//



// Finds the next tag in the input, skipping character data, comments
// and whatever is left of the tag a previous call stopped in.
//
//   sName      receives the tag name. It points into the internal
//              literal buffer and is overwritten by the next call
//              that reads a literal (getTag() or getAttribute()).
//              Names starting with a letter or "_" are in lower case.
//   bStartTag  receives true for "<name", false for "</name". It may
//              also be modified when the function returns false.
//
// Returns true if a tag was found; the read position is then just
// behind the tag name, so getAttribute() can be called. Returns false
// if the end of the input is reached first.
// Precondition (asserted): a source text has been assigned.
bool tHtmlDoc::getTag( const char *&sName, bool &bStartTag )
{
    MTG_ASSERT( m_sSource );

    if( m_bInsideTag ) {
        skipTag();
            // skipTag() leaves the flag set when the input ends
            // before the closing ">"
        m_bInsideTag = false;
    }

    while( ! eos() ) {
        skipData();

        if( *m_sCurrent != '<' )
            continue;       // end of input; eos() terminates the loop

        ++m_sCurrent;

            // No space is allowed after "<". If there is a space,
            // the "<" is treated as regular text (the Internet
            // Explorer does it this way).
        if( *m_sCurrent != '/' && ! isLiteral() )
            continue;

        if( *m_sCurrent == '/' ) {
            bStartTag = false;
            ++m_sCurrent;
            skipSpace();   // however, we allow space after "</"
        }
        else {
            bStartTag = true;
        }

            // RFC 1866 states that the attribute value must be either
            // a name or a string literal. We comply with this and
            // do not allow, as some older browsers do, arbitrary
            // character sequences without string delimiters.
        if( isLiteral() ) {
            getLiteral();
            sName = m_sLiteral;
            m_bInsideTag = true;
            return true;
        }

            // "</" without a name behind it. skipBadTag() only acts
            // while m_bInsideTag is set; the flag is cleared at this
            // point, so it has to be raised first or the call would
            // do nothing and the malformed tag would not be skipped.
            // skipBadTag() clears the flag again.
        m_bInsideTag = true;
        skipBadTag();
    }

    return false;
}





//

//   g e t A t t r i b u t e

//



// Looks for the attribute sName in the tag most recently returned by
// getTag() and consumes the rest of that tag.
//
//   sName   attribute name to look for. Scanned names that start
//           with a letter or "_" are converted to lower case before
//           the (case sensitive) comparison, so pass such names in
//           lower case. sName must not point into the literal
//           buffer, i.e. it must not be the pointer handed out by
//           getTag(): that buffer is rewritten during the search.
//   sValue  receives the attribute value on success. It points into
//           the internal literal buffer and is overwritten by the
//           next call that reads a literal. Untouched on failure.
//
// Returns true if the attribute was found with a value. Returns false
// if the read position is not inside a tag, if the attribute is not
// present, or if an "=" is not followed by a literal.
//
// This function can only retrieve one attribute per tag: in every
// case the tag is finished when it returns.
// Precondition (asserted): a source text has been assigned.
bool tHtmlDoc::getAttribute( const char *sName, const char *&sValue )
{
    MTG_ASSERT( m_sSource );

    if( ! m_bInsideTag )
        return false;

        // m_bInsideTag has to stay set while the tag is scanned:
        // skipTag() and skipBadTag() do nothing once it is cleared,
        // and the rest of the tag (including the ">") would then be
        // handed out as character data by the next getData().

    bool bFound = false;

    while( isLiteral() ) {
        getLiteral();

        if( *m_sCurrent != '=' )
            continue;       // attribute without a value; try the next

        ++m_sCurrent;
        skipSpace();

            // compare now; the buffer is reused for the value below
        const bool bMatch = ( strcmp( sName, m_sLiteral ) == 0 );

        if( ! isLiteral() ) {
                // "=" without a value: give up on this tag
            skipBadTag();
            m_bInsideTag = false;
            return false;
        }

        if( bMatch ) {
            getLiteral();
            sValue = m_sLiteral;
            bFound = true;
            break;
        }

        skipLiteral();
    }

        // Consume the remainder of the tag. skipTag() does not store
        // literals, so sValue stays valid.
    skipTag();

        // skipTag() leaves the flag set if the input ends before the
        // closing ">"; the tag is finished either way.
    m_bInsideTag = false;

    return bFound;
}





//

//   g e t D a t a

//



// Collects the character data between the read position and the next
// tag. Comments ("<!" ... ">") inside the data are dropped, and the
// pieces before and after them are concatenated.
//
//   sData  receives a newly allocated, NUL-terminated copy of the
//          data, or 0 if there is none. The buffer is allocated with
//          new char[]; ownership passes to the caller, who has to
//          release it with delete[].
//
// Returns true if at least one character of data was found. On
// return the read position is at the end of the input or on a "<"
// that does not start a comment; the caller must check whether that
// "<" indeed starts a valid tag.
// If the read position is still inside a tag, the rest of that tag
// is skipped first.
// Precondition (asserted): a source text has been assigned.
bool tHtmlDoc::getData( char *&sData )
{
    MTG_ASSERT( m_sSource );

    if( m_bInsideTag )
        skipTag();

    sData = 0;

    int nTotal = 0;     // characters collected in sData so far

    for( ;; ) {
            // Every run of data starts at the current read position.
            // Re-anchoring here on each pass matters: after a comment
            // the run must begin behind the comment, even when no
            // data preceded it.
        const char* sStart = m_sCurrent;
        int nSize = 0;

        while( *m_sCurrent && *m_sCurrent != '<' ) {
            ++m_sCurrent;
            ++nSize;
        }

        if( nSize > 0 ) {
                // grow the result by this run (+1 for the final NUL)
            char* s = new char[nTotal + nSize + 1];

            if( nTotal > 0 ) {
                memcpy( s, sData, nTotal );
                delete[] sData;     // array allocation, array delete
            }

            memcpy( &s[nTotal], sStart, nSize );
            nTotal += nSize;
            sData = s;
        }

            // stop at the end of the input or at anything that is
            // not a comment
        if( *m_sCurrent != '<' || m_sCurrent[1] != '!' )
            break;

        skipComment();
    }

    if( nTotal > 0 ) {
        sData[nTotal] = 0;
        return true;
    }

    MTG_ASSERT( sData == 0 );
    return false;
}





//

//   r e s e t

//



// Rewinds the read position to the beginning of the source text and
// forgets any tag the read position was in. The source text and the
// literal buffer are kept.
void tHtmlDoc::reset()
{
    m_bInsideTag = false;
    m_sCurrent = m_sSource;     // 0 if no source text is assigned
}





//

//   t H t m l D o c

//



// Creates an empty document without source text.
tHtmlDoc::tHtmlDoc()
{
    this->init();
}





//

//   t H t m l D o c

//



// Creates a copy of HtmlDoc. The members are first brought into the
// empty state, because copyFrom() inspects and releases the current
// buffers.
tHtmlDoc::tHtmlDoc( const tHtmlDoc& HtmlDoc )
{
    this->init();
    this->copyFrom( HtmlDoc );
}





//

//   t H t m l D o c

//



// Creates a document for the given source text. The members are
// first brought into the empty state, then the text is assigned
// through the const char* assignment operator.
tHtmlDoc::tHtmlDoc( const char* sSource )
{
    this->init();
    *this = sSource;
}





//

//   ~ t H t m l D o c

//



// Releases the buffers held by the object through cleanup().
tHtmlDoc::~tHtmlDoc()
{
    this->cleanup();
}





//

//   o p e r a t o r =

//



// Copy assignment: takes over the state of HtmlDoc through
// copyFrom(). Assigning an object to itself changes nothing.
// Returns a reference to this object.
tHtmlDoc& tHtmlDoc::operator=( const tHtmlDoc& HtmlDoc )
{
    if( this == &HtmlDoc )
        return *this;

    copyFrom( HtmlDoc );
    return *this;
}





//

//   o p e r a t o r =

//



// Replaces the source text with a private copy of sSource (made by
// StrCopy()) and rewinds the read position. The literal buffer is
// kept. Returns a reference to this object.
tHtmlDoc& tHtmlDoc::operator=( const char* sSource )
{
        // Make the copy before the old text is released. If
        // StrCopy() fails, or if sSource points into the old text,
        // m_sSource is then never left pointing at freed memory.
    char* sCopy = StrCopy( sSource );

        // NOTE(review): the allocator behind StrCopy() is not visible
        // in this file. If it uses new char[], this should become
        // delete[] - confirm. Deleting a null pointer is a no-op.
    delete m_sSource;
    m_sSource = sCopy;

    reset();
    return *this;
}





//

//   c o p y F r o m

//



// Makes this object a copy of HtmlDoc: the content of the literal
// buffer, a private copy of the source text, the read position (same
// offset into the copied text) and the inside-tag flag. Copying an
// object onto itself does nothing.
//
// If HtmlDoc has no literal buffer, the literal buffer of this object
// is left as it is.
void tHtmlDoc::copyFrom( const tHtmlDoc& HtmlDoc )
{
    if( &HtmlDoc == this )
        return;

    if( HtmlDoc.m_sLiteral != 0 ) {
            // A new buffer is needed when there is none yet (this
            // includes the case where both sizes are 0) or when the
            // existing one is too small.
        if( m_sLiteral == 0 || HtmlDoc.m_nLitSize > m_nLitSize ) {
            char* sNew = new char[HtmlDoc.m_nLitSize + 1];

            delete[] m_sLiteral;    // array allocation, array delete
            m_sLiteral = sNew;
            m_nLitSize = HtmlDoc.m_nLitSize;
        }

            // Copy exactly what the source buffer holds. The buffer
            // of this object may be larger, so its own size must not
            // be used here; that would read past the end of
            // HtmlDoc.m_sLiteral.
        memcpy( m_sLiteral, HtmlDoc.m_sLiteral, HtmlDoc.m_nLitSize + 1 );
    }

        // NOTE(review): the allocator behind StrCopy() is not visible
        // in this file. If it uses new char[], this should become
        // delete[] - confirm.
    if( m_sSource != 0 )
        delete m_sSource;
    m_sSource = StrCopy( HtmlDoc.m_sSource );

        // carry the read position over as an offset into the copy
    if( HtmlDoc.m_sCurrent != 0 )
        m_sCurrent = &m_sSource[HtmlDoc.m_sCurrent - HtmlDoc.m_sSource];
    else
        m_sCurrent = 0;

    m_bInsideTag = HtmlDoc.m_bInsideTag;
}



MTG_END_NAMESPACE

