| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397 |
- /**
- * @file
- *
- * @brief HTML parser, doesn't support HTML5
- *
- * This module originally implemented an HTML parser based on the
- * (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer
- * conforms to HTML5. Tree construction still follows a custom,
- * unspecified algorithm with many differences to HTML5.
- *
- * The parser defaults to ISO-8859-1, the default encoding of
- * HTTP/1.0.
- *
- * @copyright See Copyright for the status of this software.
- *
- * @author Daniel Veillard
- */
- #ifndef __HTML_PARSER_H__
- #define __HTML_PARSER_H__
- #include <libxml/xmlversion.h>
- #include <libxml/parser.h>
- #ifdef LIBXML_HTML_ENABLED
- #ifdef __cplusplus
- extern "C" {
- #endif
/*
 * Backward compatibility
 *
 * Legacy spellings kept so that older callers still compile:
 *  - UTF8ToHtml is the former name of htmlUTF8ToHtml.
 *  - htmlDefaultSubelement(elt) reads elt->defaultsubelt.
 *  - htmlElementAllowedHereDesc(parent, elt) forwards to
 *    htmlElementAllowedHere() with the name of `elt`.
 *  - htmlRequiredAttrs(elt) reads elt->attrs_req.
 *
 * Each macro argument is parenthesized so that an expression argument
 * (for example `table + 1`) is evaluated before the `->` member access.
 */
#define UTF8ToHtml htmlUTF8ToHtml
#define htmlDefaultSubelement(elt) (elt)->defaultsubelt
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (elt)->name)
#define htmlRequiredAttrs(elt) (elt)->attrs_req
- /*
- * Most of the back-end structures from XML and HTML are shared.
- */
- /** Same as xmlParserCtxt */
- typedef xmlParserCtxt htmlParserCtxt;
- typedef xmlParserCtxtPtr htmlParserCtxtPtr;
- typedef xmlParserNodeInfo htmlParserNodeInfo;
- /** Same as xmlSAXHandler */
- typedef xmlSAXHandler htmlSAXHandler;
- typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
- /** Same as xmlParserInput */
- typedef xmlParserInput htmlParserInput;
- typedef xmlParserInputPtr htmlParserInputPtr;
- typedef xmlDocPtr htmlDocPtr;
- typedef xmlNodePtr htmlNodePtr;
- /** @cond ignore */
- /*
- * Internal description of an HTML element, representing HTML 4.01
- * and XHTML 1.0 (which share the same structure).
- */
- typedef struct _htmlElemDesc htmlElemDesc;
- typedef htmlElemDesc *htmlElemDescPtr;
- struct _htmlElemDesc {
- const char *name; /* The tag name */
- char startTag; /* unused */
- char endTag; /* Whether the end tag can be implied */
- char saveEndTag; /* unused */
- char empty; /* Is this an empty element ? */
- char depr; /* unused */
- char dtd; /* unused */
- char isinline; /* is this a block 0 or inline 1 element */
- const char *desc; /* the description */
- const char** subelts XML_DEPRECATED_MEMBER;
- const char* defaultsubelt XML_DEPRECATED_MEMBER;
- const char** attrs_opt XML_DEPRECATED_MEMBER;
- const char** attrs_depr XML_DEPRECATED_MEMBER;
- const char** attrs_req XML_DEPRECATED_MEMBER;
- int dataMode;
- };
/*
 * Internal description of an HTML entity: its Unicode value together
 * with its name and a description string.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;

struct _htmlEntityDesc {
    unsigned int  value;    /* Unicode value of the character */
    const char   *name;     /* entity name */
    const char   *desc;     /* description */
};
#ifdef LIBXML_SAX1_ENABLED
/**
 * Constant xmlSAXHandlerV1 object; it is only declared when
 * LIBXML_SAX1_ENABLED is defined.
 *
 * @deprecated Use #xmlSAX2InitHtmlDefaultSAXHandler
 */
XML_DEPRECATED XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
#endif /* LIBXML_SAX1_ENABLED */

/** @endcond */
- /*
- * There is only few public functions.
- */
- XML_DEPRECATED
- XMLPUBFUN void
- htmlInitAutoClose (void);
- XML_DEPRECATED
- XMLPUBFUN const htmlElemDesc *
- htmlTagLookup (const xmlChar *tag);
- XML_DEPRECATED
- XMLPUBFUN const htmlEntityDesc *
- htmlEntityLookup(const xmlChar *name);
- XML_DEPRECATED
- XMLPUBFUN const htmlEntityDesc *
- htmlEntityValueLookup(unsigned int value);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlIsAutoClosed(xmlDoc *doc,
- xmlNode *elem);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlAutoCloseTag(xmlDoc *doc,
- const xmlChar *name,
- xmlNode *elem);
- XML_DEPRECATED
- XMLPUBFUN const htmlEntityDesc *
- htmlParseEntityRef(htmlParserCtxt *ctxt,
- const xmlChar **str);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlParseCharRef(htmlParserCtxt *ctxt);
- XML_DEPRECATED
- XMLPUBFUN void
- htmlParseElement(htmlParserCtxt *ctxt);
- XMLPUBFUN htmlParserCtxt *
- htmlNewParserCtxt(void);
- XMLPUBFUN htmlParserCtxt *
- htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
- void *userData);
- XMLPUBFUN htmlParserCtxt *
- htmlCreateMemoryParserCtxt(const char *buffer,
- int size);
- XMLPUBFUN int
- htmlParseDocument(htmlParserCtxt *ctxt);
- XML_DEPRECATED
- XMLPUBFUN xmlDoc *
- htmlSAXParseDoc (const xmlChar *cur,
- const char *encoding,
- htmlSAXHandler *sax,
- void *userData);
- XMLPUBFUN xmlDoc *
- htmlParseDoc (const xmlChar *cur,
- const char *encoding);
- XMLPUBFUN htmlParserCtxt *
- htmlCreateFileParserCtxt(const char *filename,
- const char *encoding);
- XML_DEPRECATED
- XMLPUBFUN xmlDoc *
- htmlSAXParseFile(const char *filename,
- const char *encoding,
- htmlSAXHandler *sax,
- void *userData);
- XMLPUBFUN xmlDoc *
- htmlParseFile (const char *filename,
- const char *encoding);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlUTF8ToHtml (unsigned char *out,
- int *outlen,
- const unsigned char *in,
- int *inlen);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlEncodeEntities(unsigned char *out,
- int *outlen,
- const unsigned char *in,
- int *inlen, int quoteChar);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlIsScriptAttribute(const xmlChar *name);
- XML_DEPRECATED
- XMLPUBFUN int
- htmlHandleOmittedElem(int val);
#ifdef LIBXML_PUSH_ENABLED
/*
 * Interfaces for the Push mode.
 *
 * These two prototypes are only declared when LIBXML_PUSH_ENABLED is
 * defined.
 *
 * NOTE(review): `terminate` presumably marks the final chunk -- confirm
 * against the implementation.
 */
XMLPUBFUN htmlParserCtxt *htmlCreatePushParserCtxt(htmlSAXHandler *sax,
        void *user_data, const char *chunk, int size,
        const char *filename, xmlCharEncoding enc);

XMLPUBFUN int htmlParseChunk(htmlParserCtxt *ctxt, const char *chunk,
        int size, int terminate);
#endif /* LIBXML_PUSH_ENABLED */
- XMLPUBFUN void
- htmlFreeParserCtxt (htmlParserCtxt *ctxt);
/*
 * New set of simpler/more flexible APIs
 */

/**
 * HTML parser options. These values can be passed to #htmlReadDoc,
 * #htmlCtxtSetOptions and other functions.
 */
typedef enum {
    /** No effect as of 2.14.0. */
    HTML_PARSE_RECOVER    = 1 << 0,

    /** Do not default to a doctype if none was found. */
    HTML_PARSE_NODEFDTD   = 1 << 2,

    /**
     * Disable error and warning reports to the error handlers.
     * Errors remain accessible with xmlCtxtGetLastError().
     */
    HTML_PARSE_NOERROR    = 1 << 5,

    /** Disable warning reports. */
    HTML_PARSE_NOWARNING  = 1 << 6,

    /** No effect. */
    HTML_PARSE_PEDANTIC   = 1 << 7,

    /**
     * Remove some text nodes that contain only whitespace from the
     * result document. A conservative heuristic decides which nodes
     * are removed. The reindenting feature of the serialization code
     * relies on this option being set when parsing. Use of this
     * option is DISCOURAGED.
     */
    HTML_PARSE_NOBLANKS   = 1 << 8,

    /** No effect. */
    HTML_PARSE_NONET      = 1 << 11,

    /** Do not add implied html, head or body elements. */
    HTML_PARSE_NOIMPLIED  = 1 << 13,

    /** Store small strings directly in the node struct to save memory. */
    HTML_PARSE_COMPACT    = 1 << 16,

    /**
     * Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption.
     *
     * @since 2.14.0
     *
     * Use XML_PARSE_HUGE with older versions.
     */
    HTML_PARSE_HUGE       = 1 << 19,

    /**
     * Ignore the encoding in the HTML declaration. This option is
     * mostly unneeded these days; its only effect is to enforce
     * ISO-8859-1 decoding of ASCII-like data.
     */
    HTML_PARSE_IGNORE_ENC = 1 << 21,

    /**
     * Enable reporting of line numbers larger than 65535.
     *
     * @since 2.14.0
     *
     * Use XML_PARSE_BIG_LINES with older versions.
     */
    HTML_PARSE_BIG_LINES  = 1 << 22,

    /**
     * Make the tokenizer emit a SAX callback for each token. This
     * results in unbalanced invocations of startElement and endElement.
     *
     * For now, this is only usable to tokenize HTML5 with custom SAX
     * callbacks. A tree builder isn't implemented yet.
     *
     * @since 2.14.0
     */
    HTML_PARSE_HTML5      = 1 << 26
} htmlParserOption;
- XMLPUBFUN void
- htmlCtxtReset (htmlParserCtxt *ctxt);
- XMLPUBFUN int
- htmlCtxtSetOptions (htmlParserCtxt *ctxt,
- int options);
- XMLPUBFUN int
- htmlCtxtUseOptions (htmlParserCtxt *ctxt,
- int options);
- XMLPUBFUN xmlDoc *
- htmlReadDoc (const xmlChar *cur,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlReadFile (const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlReadMemory (const char *buffer,
- int size,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlReadFd (int fd,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlReadIO (xmlInputReadCallback ioread,
- xmlInputCloseCallback ioclose,
- void *ioctx,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlCtxtParseDocument (htmlParserCtxt *ctxt,
- xmlParserInput *input);
- XMLPUBFUN xmlDoc *
- htmlCtxtReadDoc (xmlParserCtxt *ctxt,
- const xmlChar *cur,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlCtxtReadFile (xmlParserCtxt *ctxt,
- const char *filename,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlCtxtReadMemory (xmlParserCtxt *ctxt,
- const char *buffer,
- int size,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlCtxtReadFd (xmlParserCtxt *ctxt,
- int fd,
- const char *URL,
- const char *encoding,
- int options);
- XMLPUBFUN xmlDoc *
- htmlCtxtReadIO (xmlParserCtxt *ctxt,
- xmlInputReadCallback ioread,
- xmlInputCloseCallback ioclose,
- void *ioctx,
- const char *URL,
- const char *encoding,
- int options);
/**
 * deprecated content model
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit is set, so (x & HTML_VALID) is true */
} htmlStatus;
- /* Using htmlElemDesc rather than name here, to emphasise the fact
- that otherwise there's a lookup overhead
- */
- XML_DEPRECATED
- XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
- XML_DEPRECATED
- XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
- XML_DEPRECATED
- XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
- XML_DEPRECATED
- XMLPUBFUN htmlStatus htmlNodeStatus(xmlNode *, int) ;
- #ifdef __cplusplus
- }
- #endif
- #endif /* LIBXML_HTML_ENABLED */
- #endif /* __HTML_PARSER_H__ */
|