public class TesseractOCRParser
extends org.apache.tika.parser.AbstractExternalProcessParser
implements org.apache.tika.config.Initializable
To configure this parser, create a TesseractOCRConfig object and pass it through a
ParseContext. Tesseract-ocr must be installed and on the system path, or the path
to its root folder must be provided:
TesseractOCRConfig config = new TesseractOCRConfig();
//Needed if tesseract is not on system path
config.setTesseractPath(tesseractFolder);
parseContext.set(TesseractOCRConfig.class, config);
| Constructor and Description |
|---|
| TesseractOCRParser() |
| Modifier and Type | Method and Description |
|---|---|
| void | checkInitialization(org.apache.tika.config.InitializableProblemHandler problemHandler) |
| TesseractOCRConfig | getDefaultConfig() |
| Set<org.apache.tika.mime.MediaType> | getSupportedTypes(org.apache.tika.parser.ParseContext context) |
| boolean | hasTesseract(TesseractOCRConfig config) |
| protected boolean | hasWarned() |
| void | initialize(Map<String,org.apache.tika.config.Param> params)<br>no-op |
| void | parse(Image image, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext context) |
| void | parse(InputStream stream, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext parseContext) |
| void | parseInline(InputStream stream, org.apache.tika.sax.XHTMLContentHandler xhtml, org.apache.tika.parser.ParseContext parseContext, TesseractOCRConfig config)<br>Use this to parse content without starting a new document. |
| void | parseInline(InputStream stream, org.apache.tika.sax.XHTMLContentHandler xhtml, TesseractOCRConfig config) |
| void | setApplyRotation(boolean applyRotation) |
| void | setColorspace(String colorspace) |
| void | setDensity(int density) |
| void | setDepth(int depth) |
| void | setEnableImageProcessing(int enableImageProcessing) |
| void | setFilter(String filter) |
| void | setImageMagickPath(String imageMagickPath) |
| void | setLanguage(String language) |
| void | setMaxFileSizeToOcr(long maxFileSizeToOcr) |
| void | setMinFileSizeToOcr(long minFileSizeToOcr) |
| void | setOutputType(String outputType) |
| void | setPageSegMode(String pageSegMode) |
| void | setPreserveInterwordSpacing(boolean preserveInterwordSpacing) |
| void | setResize(int resize) |
| void | setTessdataPath(String tessdataPath) |
| void | setTesseractPath(String tesseractPath) |
| void | setTimeout(int timeout) |
| protected void | warn() |
public Set<org.apache.tika.mime.MediaType> getSupportedTypes(org.apache.tika.parser.ParseContext context)
Specified by: getSupportedTypes in interface org.apache.tika.parser.Parser
public boolean hasTesseract(TesseractOCRConfig config)
public void parse(Image image, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext context) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void parse(InputStream stream, ContentHandler handler, org.apache.tika.metadata.Metadata metadata, org.apache.tika.parser.ParseContext parseContext) throws IOException, SAXException, org.apache.tika.exception.TikaException
Specified by: parse in interface org.apache.tika.parser.Parser
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void parseInline(InputStream stream, org.apache.tika.sax.XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, org.apache.tika.exception.TikaException
Parameters:
stream - input stream
xhtml - handler
config - TesseractOCRConfig to use for this parse
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void parseInline(InputStream stream, org.apache.tika.sax.XHTMLContentHandler xhtml, org.apache.tika.parser.ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, org.apache.tika.exception.TikaException
Parameters:
stream - input stream
xhtml - handler
config - TesseractOCRConfig to use for this parse
Throws: IOException, SAXException, org.apache.tika.exception.TikaException
public void initialize(Map<String,org.apache.tika.config.Param> params) throws org.apache.tika.exception.TikaConfigException
Specified by: initialize in interface org.apache.tika.config.Initializable
Parameters:
params - params to use for initialization
Throws: org.apache.tika.exception.TikaConfigException
public void checkInitialization(org.apache.tika.config.InitializableProblemHandler problemHandler)
throws org.apache.tika.exception.TikaConfigException
Specified by: checkInitialization in interface org.apache.tika.config.Initializable
Throws: org.apache.tika.exception.TikaConfigException
protected boolean hasWarned()
protected void warn()
@Field public void setTesseractPath(String tesseractPath)
@Field public void setTessdataPath(String tessdataPath)
@Field public void setLanguage(String language)
@Field public void setPageSegMode(String pageSegMode)
@Field public void setMaxFileSizeToOcr(long maxFileSizeToOcr)
@Field public void setMinFileSizeToOcr(long minFileSizeToOcr)
@Field public void setTimeout(int timeout)
@Field public void setOutputType(String outputType)
@Field public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing)
@Field public void setEnableImageProcessing(int enableImageProcessing)
@Field public void setImageMagickPath(String imageMagickPath)
@Field public void setDensity(int density)
@Field public void setDepth(int depth)
@Field public void setColorspace(String colorspace)
@Field public void setFilter(String filter)
@Field public void setResize(int resize)
@Field public void setApplyRotation(boolean applyRotation)
public TesseractOCRConfig getDefaultConfig()
Copyright © 2007–2022 The Apache Software Foundation. All rights reserved.