HTMLparser.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. /**
  2. * @file
  3. *
  4. * @brief HTML parser, doesn't support HTML5
  5. *
  6. * This module originally implemented an HTML parser based on the
  7. * (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer
  8. * conforms to HTML5. Tree construction still follows a custom,
  9. * unspecified algorithm with many differences to HTML5.
  10. *
  11. * The parser defaults to ISO-8859-1, the default encoding of
  12. * HTTP/1.0.
  13. *
  14. * @copyright See Copyright for the status of this software.
  15. *
  16. * @author Daniel Veillard
  17. */
  18. #ifndef __HTML_PARSER_H__
  19. #define __HTML_PARSER_H__
  20. #include <libxml/xmlversion.h>
  21. #include <libxml/parser.h>
  22. #ifdef LIBXML_HTML_ENABLED
  23. #ifdef __cplusplus
  24. extern "C" {
  25. #endif
  26. /*
  27. * Backward compatibility
  28. */
  29. #define UTF8ToHtml htmlUTF8ToHtml
  30. #define htmlDefaultSubelement(elt) elt->defaultsubelt
  31. #define htmlElementAllowedHereDesc(parent,elt) \
  32. htmlElementAllowedHere((parent), (elt)->name)
  33. #define htmlRequiredAttrs(elt) (elt)->attrs_req
  34. /*
  35. * Most of the back-end structures from XML and HTML are shared.
  36. */
  37. /** Same as xmlParserCtxt */
  38. typedef xmlParserCtxt htmlParserCtxt;
  39. typedef xmlParserCtxtPtr htmlParserCtxtPtr;
  40. typedef xmlParserNodeInfo htmlParserNodeInfo;
  41. /** Same as xmlSAXHandler */
  42. typedef xmlSAXHandler htmlSAXHandler;
  43. typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
  44. /** Same as xmlParserInput */
  45. typedef xmlParserInput htmlParserInput;
  46. typedef xmlParserInputPtr htmlParserInputPtr;
  47. typedef xmlDocPtr htmlDocPtr;
  48. typedef xmlNodePtr htmlNodePtr;
  49. /** @cond ignore */
  50. /*
  51. * Internal description of an HTML element, representing HTML 4.01
  52. * and XHTML 1.0 (which share the same structure).
  53. */
  54. typedef struct _htmlElemDesc htmlElemDesc;
  55. typedef htmlElemDesc *htmlElemDescPtr;
  56. struct _htmlElemDesc {
  57. const char *name; /* The tag name */
  58. char startTag; /* unused */
  59. char endTag; /* Whether the end tag can be implied */
  60. char saveEndTag; /* unused */
  61. char empty; /* Is this an empty element ? */
  62. char depr; /* unused */
  63. char dtd; /* unused */
  64. char isinline; /* is this a block 0 or inline 1 element */
  65. const char *desc; /* the description */
  66. const char** subelts XML_DEPRECATED_MEMBER;
  67. const char* defaultsubelt XML_DEPRECATED_MEMBER;
  68. const char** attrs_opt XML_DEPRECATED_MEMBER;
  69. const char** attrs_depr XML_DEPRECATED_MEMBER;
  70. const char** attrs_req XML_DEPRECATED_MEMBER;
  71. int dataMode;
  72. };
  73. /*
  74. * Internal description of an HTML entity.
  75. */
  76. typedef struct _htmlEntityDesc htmlEntityDesc;
  77. typedef htmlEntityDesc *htmlEntityDescPtr;
  78. struct _htmlEntityDesc {
  79. unsigned int value; /* the UNICODE value for the character */
  80. const char *name; /* The entity name */
  81. const char *desc; /* the description */
  82. };
  83. #ifdef LIBXML_SAX1_ENABLED
  84. /**
  85. * @deprecated Use #xmlSAX2InitHtmlDefaultSAXHandler
  86. */
  87. XML_DEPRECATED
  88. XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
  89. #endif /* LIBXML_SAX1_ENABLED */
  90. /** @endcond */
  91. /*
  92. * There is only few public functions.
  93. */
  94. XML_DEPRECATED
  95. XMLPUBFUN void
  96. htmlInitAutoClose (void);
  97. XML_DEPRECATED
  98. XMLPUBFUN const htmlElemDesc *
  99. htmlTagLookup (const xmlChar *tag);
  100. XML_DEPRECATED
  101. XMLPUBFUN const htmlEntityDesc *
  102. htmlEntityLookup(const xmlChar *name);
  103. XML_DEPRECATED
  104. XMLPUBFUN const htmlEntityDesc *
  105. htmlEntityValueLookup(unsigned int value);
  106. XML_DEPRECATED
  107. XMLPUBFUN int
  108. htmlIsAutoClosed(xmlDoc *doc,
  109. xmlNode *elem);
  110. XML_DEPRECATED
  111. XMLPUBFUN int
  112. htmlAutoCloseTag(xmlDoc *doc,
  113. const xmlChar *name,
  114. xmlNode *elem);
  115. XML_DEPRECATED
  116. XMLPUBFUN const htmlEntityDesc *
  117. htmlParseEntityRef(htmlParserCtxt *ctxt,
  118. const xmlChar **str);
  119. XML_DEPRECATED
  120. XMLPUBFUN int
  121. htmlParseCharRef(htmlParserCtxt *ctxt);
  122. XML_DEPRECATED
  123. XMLPUBFUN void
  124. htmlParseElement(htmlParserCtxt *ctxt);
  125. XMLPUBFUN htmlParserCtxt *
  126. htmlNewParserCtxt(void);
  127. XMLPUBFUN htmlParserCtxt *
  128. htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
  129. void *userData);
  130. XMLPUBFUN htmlParserCtxt *
  131. htmlCreateMemoryParserCtxt(const char *buffer,
  132. int size);
  133. XMLPUBFUN int
  134. htmlParseDocument(htmlParserCtxt *ctxt);
  135. XML_DEPRECATED
  136. XMLPUBFUN xmlDoc *
  137. htmlSAXParseDoc (const xmlChar *cur,
  138. const char *encoding,
  139. htmlSAXHandler *sax,
  140. void *userData);
  141. XMLPUBFUN xmlDoc *
  142. htmlParseDoc (const xmlChar *cur,
  143. const char *encoding);
  144. XMLPUBFUN htmlParserCtxt *
  145. htmlCreateFileParserCtxt(const char *filename,
  146. const char *encoding);
  147. XML_DEPRECATED
  148. XMLPUBFUN xmlDoc *
  149. htmlSAXParseFile(const char *filename,
  150. const char *encoding,
  151. htmlSAXHandler *sax,
  152. void *userData);
  153. XMLPUBFUN xmlDoc *
  154. htmlParseFile (const char *filename,
  155. const char *encoding);
  156. XML_DEPRECATED
  157. XMLPUBFUN int
  158. htmlUTF8ToHtml (unsigned char *out,
  159. int *outlen,
  160. const unsigned char *in,
  161. int *inlen);
  162. XML_DEPRECATED
  163. XMLPUBFUN int
  164. htmlEncodeEntities(unsigned char *out,
  165. int *outlen,
  166. const unsigned char *in,
  167. int *inlen, int quoteChar);
  168. XML_DEPRECATED
  169. XMLPUBFUN int
  170. htmlIsScriptAttribute(const xmlChar *name);
  171. XML_DEPRECATED
  172. XMLPUBFUN int
  173. htmlHandleOmittedElem(int val);
  174. #ifdef LIBXML_PUSH_ENABLED
  175. /*
  176. * Interfaces for the Push mode.
  177. */
  178. XMLPUBFUN htmlParserCtxt *
  179. htmlCreatePushParserCtxt(htmlSAXHandler *sax,
  180. void *user_data,
  181. const char *chunk,
  182. int size,
  183. const char *filename,
  184. xmlCharEncoding enc);
  185. XMLPUBFUN int
  186. htmlParseChunk (htmlParserCtxt *ctxt,
  187. const char *chunk,
  188. int size,
  189. int terminate);
  190. #endif /* LIBXML_PUSH_ENABLED */
  191. XMLPUBFUN void
  192. htmlFreeParserCtxt (htmlParserCtxt *ctxt);
  193. /*
  194. * New set of simpler/more flexible APIs
  195. */
  196. /**
  197. * This is the set of HTML parser options that can be passed to
  198. * #htmlReadDoc, #htmlCtxtSetOptions and other functions.
  199. */
  200. typedef enum {
  201. /**
  202. * No effect as of 2.14.0.
  203. */
  204. HTML_PARSE_RECOVER = 1<<0,
  205. /**
  206. * Do not default to a doctype if none was found.
  207. */
  208. HTML_PARSE_NODEFDTD = 1<<2,
  209. /**
  210. * Disable error and warning reports to the error handlers.
  211. * Errors are still accessible with xmlCtxtGetLastError().
  212. */
  213. HTML_PARSE_NOERROR = 1<<5,
  214. /**
  215. * Disable warning reports.
  216. */
  217. HTML_PARSE_NOWARNING = 1<<6,
  218. /**
  219. * No effect.
  220. */
  221. HTML_PARSE_PEDANTIC = 1<<7,
  222. /**
  223. * Remove some text nodes containing only whitespace from the
  224. * result document. Which nodes are removed depends on a conservative
  225. * heuristic. The reindenting feature of the serialization code relies
  226. * on this option to be set when parsing. Use of this option is
  227. * DISCOURAGED.
  228. */
  229. HTML_PARSE_NOBLANKS = 1<<8,
  230. /**
  231. * No effect.
  232. */
  233. HTML_PARSE_NONET = 1<<11,
  234. /**
  235. * Do not add implied html, head or body elements.
  236. */
  237. HTML_PARSE_NOIMPLIED = 1<<13,
  238. /**
  239. * Store small strings directly in the node struct to save
  240. * memory.
  241. */
  242. HTML_PARSE_COMPACT = 1<<16,
  243. /**
  244. * Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption.
  245. *
  246. * @since 2.14.0
  247. *
  248. * Use XML_PARSE_HUGE with older versions.
  249. */
  250. HTML_PARSE_HUGE = 1<<19,
  251. /**
  252. * Ignore the encoding in the HTML declaration. This option is
  253. * mostly unneeded these days. The only effect is to enforce
  254. * ISO-8859-1 decoding of ASCII-like data.
  255. */
  256. HTML_PARSE_IGNORE_ENC =1<<21,
  257. /**
  258. * Enable reporting of line numbers larger than 65535.
  259. *
  260. * @since 2.14.0
  261. *
  262. * Use XML_PARSE_BIG_LINES with older versions.
  263. */
  264. HTML_PARSE_BIG_LINES = 1<<22,
  265. /**
  266. * Make the tokenizer emit a SAX callback for each token. This results
  267. * in unbalanced invocations of startElement and endElement.
  268. *
  269. * For now, this is only usable to tokenize HTML5 with custom SAX
  270. * callbacks. A tree builder isn't implemented yet.
  271. *
  272. * @since 2.14.0
  273. */
  274. HTML_PARSE_HTML5 = 1<<26
  275. } htmlParserOption;
  276. XMLPUBFUN void
  277. htmlCtxtReset (htmlParserCtxt *ctxt);
  278. XMLPUBFUN int
  279. htmlCtxtSetOptions (htmlParserCtxt *ctxt,
  280. int options);
  281. XMLPUBFUN int
  282. htmlCtxtUseOptions (htmlParserCtxt *ctxt,
  283. int options);
  284. XMLPUBFUN xmlDoc *
  285. htmlReadDoc (const xmlChar *cur,
  286. const char *URL,
  287. const char *encoding,
  288. int options);
  289. XMLPUBFUN xmlDoc *
  290. htmlReadFile (const char *URL,
  291. const char *encoding,
  292. int options);
  293. XMLPUBFUN xmlDoc *
  294. htmlReadMemory (const char *buffer,
  295. int size,
  296. const char *URL,
  297. const char *encoding,
  298. int options);
  299. XMLPUBFUN xmlDoc *
  300. htmlReadFd (int fd,
  301. const char *URL,
  302. const char *encoding,
  303. int options);
  304. XMLPUBFUN xmlDoc *
  305. htmlReadIO (xmlInputReadCallback ioread,
  306. xmlInputCloseCallback ioclose,
  307. void *ioctx,
  308. const char *URL,
  309. const char *encoding,
  310. int options);
  311. XMLPUBFUN xmlDoc *
  312. htmlCtxtParseDocument (htmlParserCtxt *ctxt,
  313. xmlParserInput *input);
  314. XMLPUBFUN xmlDoc *
  315. htmlCtxtReadDoc (xmlParserCtxt *ctxt,
  316. const xmlChar *cur,
  317. const char *URL,
  318. const char *encoding,
  319. int options);
  320. XMLPUBFUN xmlDoc *
  321. htmlCtxtReadFile (xmlParserCtxt *ctxt,
  322. const char *filename,
  323. const char *encoding,
  324. int options);
  325. XMLPUBFUN xmlDoc *
  326. htmlCtxtReadMemory (xmlParserCtxt *ctxt,
  327. const char *buffer,
  328. int size,
  329. const char *URL,
  330. const char *encoding,
  331. int options);
  332. XMLPUBFUN xmlDoc *
  333. htmlCtxtReadFd (xmlParserCtxt *ctxt,
  334. int fd,
  335. const char *URL,
  336. const char *encoding,
  337. int options);
  338. XMLPUBFUN xmlDoc *
  339. htmlCtxtReadIO (xmlParserCtxt *ctxt,
  340. xmlInputReadCallback ioread,
  341. xmlInputCloseCallback ioclose,
  342. void *ioctx,
  343. const char *URL,
  344. const char *encoding,
  345. int options);
  346. /**
  347. * deprecated content model
  348. */
  349. typedef enum {
  350. HTML_NA = 0 , /* something we don't check at all */
  351. HTML_INVALID = 0x1 ,
  352. HTML_DEPRECATED = 0x2 ,
  353. HTML_VALID = 0x4 ,
  354. HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
  355. } htmlStatus ;
  356. /* Using htmlElemDesc rather than name here, to emphasise the fact
  357. that otherwise there's a lookup overhead
  358. */
  359. XML_DEPRECATED
  360. XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
  361. XML_DEPRECATED
  362. XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
  363. XML_DEPRECATED
  364. XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
  365. XML_DEPRECATED
  366. XMLPUBFUN htmlStatus htmlNodeStatus(xmlNode *, int) ;
  367. #ifdef __cplusplus
  368. }
  369. #endif
  370. #endif /* LIBXML_HTML_ENABLED */
  371. #endif /* __HTML_PARSER_H__ */