public class BoilerpipeContentHandler
extends de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Intended for use as the ContentHandler object passed to HtmlParser.parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext).
| Constructor and Description |
|---|
| BoilerpipeContentHandler(org.xml.sax.ContentHandler delegate) — Creates a new boilerpipe-based content extractor, using the DefaultExtractor extraction rules and "delegate" as the content handler. |
| BoilerpipeContentHandler(org.xml.sax.ContentHandler delegate, de.l3s.boilerpipe.BoilerpipeExtractor extractor) — Creates a new boilerpipe-based content extractor, using the given extraction rules. |
| BoilerpipeContentHandler(java.io.Writer writer) — Creates a content handler that writes XHTML body character events to the given writer. |
| Modifier and Type | Method and Description |
|---|---|
| void | characters(char[] chars, int offset, int length) |
| void | endDocument() |
| void | endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) |
| de.l3s.boilerpipe.document.TextDocument | getTextDocument() — Retrieves the built TextDocument |
| boolean | isIncludeMarkup() |
| void | setIncludeMarkup(boolean includeMarkup) |
| void | startDocument() |
| void | startElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts) |
| void | startPrefixMapping(java.lang.String prefix, java.lang.String uri) |
public BoilerpipeContentHandler(org.xml.sax.ContentHandler delegate)
Creates a new boilerpipe-based content extractor, using the DefaultExtractor
extraction rules and "delegate" as the content handler.
Parameters:
delegate - The ContentHandler object
public BoilerpipeContentHandler(java.io.Writer writer)
Parameters:
writer - writer
public BoilerpipeContentHandler(org.xml.sax.ContentHandler delegate, de.l3s.boilerpipe.BoilerpipeExtractor extractor)
Parameters:
delegate - The ContentHandler object
extractor - Extraction rules to use, e.g. ArticleExtractor
public boolean isIncludeMarkup()
public void setIncludeMarkup(boolean includeMarkup)
public de.l3s.boilerpipe.document.TextDocument getTextDocument()
public void startDocument() throws org.xml.sax.SAXException
Specified by: startDocument in interface org.xml.sax.ContentHandler
Overrides: startDocument in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
public void startPrefixMapping(java.lang.String prefix, java.lang.String uri) throws org.xml.sax.SAXException
Specified by: startPrefixMapping in interface org.xml.sax.ContentHandler
Overrides: startPrefixMapping in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
public void startElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts) throws org.xml.sax.SAXException
Specified by: startElement in interface org.xml.sax.ContentHandler
Overrides: startElement in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
public void characters(char[] chars, int offset, int length) throws org.xml.sax.SAXException
Specified by: characters in interface org.xml.sax.ContentHandler
Overrides: characters in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws org.xml.sax.SAXException
Specified by: endElement in interface org.xml.sax.ContentHandler
Overrides: endElement in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
public void endDocument() throws org.xml.sax.SAXException
Specified by: endDocument in interface org.xml.sax.ContentHandler
Overrides: endDocument in class de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler
Throws: org.xml.sax.SAXException
Copyright © 2010 - 2020 Adobe. All Rights Reserved