public class TesseractOCRConfig
extends java.lang.Object
implements java.io.Serializable
This class lets you enable TesseractOCRParser and set its parameters:
TesseractOCRConfig config = new TesseractOCRConfig();
config.setTesseractPath(tesseractFolder);
parseContext.set(TesseractOCRConfig.class, config);
Parameters can also be set either by editing the existing TesseractOCRConfig.properties file in tika-parser/src/main/resources/org/apache/tika/parser/ocr, or by overriding it: create your own TesseractOCRConfig.properties file and place it in the package org/apache/tika/parser/ocr on the classpath.
Modifier and Type | Class and Description |
---|---|
static class |
TesseractOCRConfig.OUTPUT_TYPE |
Constructor and Description |
---|
TesseractOCRConfig()
Default constructor.
|
TesseractOCRConfig(java.io.InputStream is)
Loads properties from InputStream and then tries to close InputStream.
|
Modifier and Type | Method and Description |
---|---|
void |
addOtherTesseractConfig(java.lang.String key,
java.lang.String value)
Add a key-value pair to pass to Tesseract using its -c command line option.
|
boolean |
getApplyRotation() |
java.lang.String |
getColorspace() |
int |
getDensity() |
int |
getDepth() |
java.lang.String |
getFilter() |
java.lang.String |
getImageMagickPath() |
java.lang.String |
getLanguage() |
long |
getMaxFileSizeToOcr() |
long |
getMinFileSizeToOcr() |
java.util.Map<java.lang.String,java.lang.String> |
getOtherTesseractConfig() |
TesseractOCRConfig.OUTPUT_TYPE |
getOutputType() |
java.lang.String |
getPageSegMode() |
java.lang.String |
getPageSeparator() |
boolean |
getPreserveInterwordSpacing() |
int |
getResize() |
java.lang.String |
getTessdataPath() |
java.lang.String |
getTesseractPath() |
int |
getTimeout() |
int |
isEnableImageProcessing() |
void |
setApplyRotation(boolean applyRotation)
Sets whether or not a rotation value should be calculated and passed to ImageMagick.
|
void |
setColorspace(java.lang.String colorspace) |
void |
setDensity(int density) |
void |
setDepth(int depth) |
void |
setEnableImageProcessing(int enableImageProcessing)
Set the value to true if processing is to be enabled.
|
void |
setFilter(java.lang.String filter) |
void |
setImageMagickPath(java.lang.String imageMagickPath)
Set the path to the ImageMagick executable directory, needed if it is not on system path.
|
void |
setLanguage(java.lang.String language)
Set tesseract language dictionary to be used.
|
void |
setMaxFileSizeToOcr(long maxFileSizeToOcr)
Set the maximum size of a file that will be submitted to OCR.
|
void |
setMinFileSizeToOcr(long minFileSizeToOcr)
Set the minimum size of a file that will be submitted to OCR.
|
void |
setOutputType(java.lang.String outputType) |
void |
setOutputType(TesseractOCRConfig.OUTPUT_TYPE outputType)
Set output type from ocr process.
|
void |
setPageSegMode(java.lang.String pageSegMode)
Set tesseract page segmentation mode.
|
void |
setPageSeparator(java.lang.String pageSeparator)
The page separator to use in plain text output.
|
void |
setPreserveInterwordSpacing(boolean preserveInterwordSpacing)
Whether or not to maintain interword spacing.
|
void |
setResize(int resize) |
void |
setTessdataPath(java.lang.String tessdataPath)
Set the path to the 'tessdata' folder, which contains language files and config files.
|
void |
setTesseractPath(java.lang.String tesseractPath)
Set the path to the Tesseract executable's directory, needed if it is not on system path.
|
void |
setTimeout(int timeout)
Set the maximum time (in seconds) to wait for the OCR process to terminate.
|
void |
setTrustedPageSeparator(java.lang.String pageSeparator)
Same as
setPageSeparator(String) but does not perform
any checks on the string. |
public TesseractOCRConfig()
public TesseractOCRConfig(java.io.InputStream is)
is
- public java.lang.String getTesseractPath()
setTesseractPath(String tesseractPath)
public void setTesseractPath(java.lang.String tesseractPath)
Note that if you set this value, it is highly recommended that you also
set the path to the 'tessdata' folder using setTessdataPath(java.lang.String)
.
public java.lang.String getTessdataPath()
setTessdataPath(String tessdataPath)
public void setTessdataPath(java.lang.String tessdataPath)
public java.lang.String getLanguage()
setLanguage(String language)
public void setLanguage(java.lang.String language)
public java.lang.String getPageSegMode()
setPageSegMode(String pageSegMode)
public void setPageSegMode(java.lang.String pageSegMode)
public java.lang.String getPageSeparator()
setPageSeparator(String pageSeparator)
public void setPageSeparator(java.lang.String pageSeparator)
pageSeparator
- public void setTrustedPageSeparator(java.lang.String pageSeparator)
setPageSeparator(String)
but does not perform
any checks on the string.
pageSeparator
- public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing)
Whether or not to maintain interword spacing. Default is false.
preserveInterwordSpacing
- public boolean getPreserveInterwordSpacing()
public long getMinFileSizeToOcr()
public void setMinFileSizeToOcr(long minFileSizeToOcr)
public long getMaxFileSizeToOcr()
public void setMaxFileSizeToOcr(long maxFileSizeToOcr)
public void setTimeout(int timeout)
public int getTimeout()
setTimeout(int timeout)
public void setOutputType(TesseractOCRConfig.OUTPUT_TYPE outputType)
Set output type from ocr process. Default is TesseractOCRConfig.OUTPUT_TYPE.TXT.
public void setOutputType(java.lang.String outputType)
public TesseractOCRConfig.OUTPUT_TYPE getOutputType()
setOutputType(OUTPUT_TYPE outputType)
public int isEnableImageProcessing()
setEnableImageProcessing(int)
public void setEnableImageProcessing(int enableImageProcessing)
public int getDensity()
public void setDensity(int density)
density
- the density to set. Valid range of values is 150-1200.
Default value is 300.
public int getDepth()
public void setDepth(int depth)
depth
- the depth to set. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096.
Default value is 4.
public java.lang.String getColorspace()
public void setColorspace(java.lang.String colorspace)
colorspace
- the colorspace to set
Default value is gray.
public java.lang.String getFilter()
public void setFilter(java.lang.String filter)
filter
- the filter to set. Valid values are point, hermite, cubic, box, gaussian, catrom, triangle, quadratic and mitchell.
Default value is triangle.
public int getResize()
public void setResize(int resize)
resize
- the resize to set. Valid range of values is 100-900.
Default value is 900.
public java.lang.String getImageMagickPath()
setImageMagickPath(String imageMagickPath)
public void setImageMagickPath(java.lang.String imageMagickPath)
imageMagickPath
- the path to the ImageMagick executable directory.
public boolean getApplyRotation()
public void setApplyRotation(boolean applyRotation)
applyRotation
- true to calculate and apply rotation, false to skip. Default is false; true requires Python to be installed.
public java.util.Map<java.lang.String,java.lang.String> getOtherTesseractConfig()
addOtherTesseractConfig(String, String)
public void addOtherTesseractConfig(java.lang.String key, java.lang.String value)
key
- value
- "Copyright © 2010 - 2020 Adobe Systems Incorporated. All Rights Reserved"