OCR PDF
Use built-in optical character recognition (OCR) to convert images to text and enable fully text searchable documents for archiving and creation of searchable indexes.
REST API
See our public API Reference for OCR PDF
Text recognition (OCR)
Optical character recognition (OCR) converts images to text so that you
and your users can fully interact with the PDF file. After performing
OCR, the PDF may be fully editable and searchable. The input format must
be application/pdf.
This sample defaults to the en-us locale. For other languages, see OCR with explicit language.
Please refer to the API usage guide to understand how to use our APIs.
Copied to your clipboard1// Get the samples from https://www.adobe.com/go/pdftoolsapi_java_samples2// Run the sample:3// mvn -f pom.xml exec:java -Dexec.mainClass=com.adobe.pdfservices.operation.samples.ocrpdf.OcrPDF45public class OcrPDF {67 // Initialize the logger.8 private static final Logger LOGGER = LoggerFactory.getLogger(OcrPDF.class);910 public static void main(String[] args) {11 try (InputStream inputStream = Files.newInputStream(new File("src/main/resources/ocrInput.pdf").toPath())) {12 // Initial setup, create credentials instance13 Credentials credentials = new ServicePrincipalCredentials(14 System.getenv("PDF_SERVICES_CLIENT_ID"),15 System.getenv("PDF_SERVICES_CLIENT_SECRET"));1617 // Creates a PDF Services instance18 PDFServices pdfServices = new PDFServices(credentials);1920 // Creates an asset(s) from source file(s) and upload21 Asset asset = pdfServices.upload(inputStream, PDFServicesMediaType.PDF.getMediaType());2223 // Creates a new job instance24 OCRJob ocrJob = new OCRJob(asset);2526 // Submit the job and gets the job result27 String location = pdfServices.submit(ocrJob);28 PDFServicesResponse<OCRResult> pdfServicesResponse = pdfServices.getJobResult(location, OCRResult.class);2930 // Get content from the resulting asset(s)31 Asset resultAsset = pdfServicesResponse.getResult().getAsset();32 StreamAsset streamAsset = pdfServices.getContent(resultAsset);3334 // Creates an output stream and copy stream asset's content to it35 Files.createDirectories(Paths.get("output/"));36 OutputStream outputStream = Files.newOutputStream(new File("output/ocrOutput.pdf").toPath());37 LOGGER.info("Saving asset at output/ocrOutput.pdf");38 IOUtils.copy(streamAsset.getInputStream(), outputStream);39 outputStream.close();40 } catch (ServiceApiException | IOException | SDKException | ServiceUsageException ex) {41 LOGGER.error("Exception encountered while executing operation", ex);42 }43 }44}
// Get the samples from https://www.adobe.com/go/pdftoolsapi_net_samples
// Run the sample:
// cd OcrPDF/
// dotnet run OcrPDF.csproj

namespace OcrPDF
{
    /// <summary>
    /// Runs an OCR operation on ocrInput.pdf with the default options and saves
    /// the result under the current directory as output/ocrOperationOutput.pdf.
    /// </summary>
    class Program
    {
        private static readonly ILog log = LogManager.GetLogger(typeof(Program));

        static void Main()
        {
            // Configure the logging
            ConfigureLogging();
            try
            {
                // Initial setup, create credentials instance.
                // The client id and secret are read from environment variables
                // instead of passing the variable names themselves as credentials.
                Credentials credentials = Credentials.ServicePrincipalCredentialsBuilder()
                    .WithClientId(System.Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"))
                    .WithClientSecret(System.Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET"))
                    .Build();

                // Create an ExecutionContext using credentials and create a new operation instance.
                ExecutionContext executionContext = ExecutionContext.Create(credentials);
                OCROperation ocrOperation = OCROperation.CreateNew();

                // Set operation input from a source file.
                FileRef sourceFileRef = FileRef.CreateFromLocalFile(@"ocrInput.pdf");
                ocrOperation.SetInput(sourceFileRef);

                // Execute the operation.
                FileRef result = ocrOperation.Execute(executionContext);

                // Save the result to the specified location.
                result.SaveAs(Directory.GetCurrentDirectory() + "/output/ocrOperationOutput.pdf");
            }
            catch (ServiceUsageException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            // Catch more errors here. . .
        }

        /// <summary>
        /// Configures log4net for the entry assembly from the log4net.config file.
        /// </summary>
        static void ConfigureLogging()
        {
            ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly());
            XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config"));
        }
    }
}
// Get the samples from http://www.adobe.com/go/pdftoolsapi_node_sample
// Run the sample:
// node src/ocr/ocr-pdf.js

const {
    ServicePrincipalCredentials,
    PDFServices,
    MimeType,
    OCRJob,
    OCRResult
} = require("@adobe/pdfservices-node-sdk");
const fs = require("fs");

// Uploads ./ocrInput.pdf, runs an OCR job with the default options and
// writes the resulting asset to ./ocrOutput.pdf.
(async () => {
    let readStream;
    try {
        // Initial setup, create credentials instance
        const credentials = new ServicePrincipalCredentials({
            clientId: process.env.PDF_SERVICES_CLIENT_ID,
            clientSecret: process.env.PDF_SERVICES_CLIENT_SECRET
        });

        // Creates a PDF Services instance
        const pdfServices = new PDFServices({credentials});

        // Creates an asset(s) from source file(s) and upload
        readStream = fs.createReadStream("./ocrInput.pdf");
        const inputAsset = await pdfServices.upload({
            readStream,
            mimeType: MimeType.PDF
        });

        // Creates a new job instance
        const job = new OCRJob({inputAsset});

        // Submit the job and get the job result
        const pollingURL = await pdfServices.submit({job});
        const pdfServicesResponse = await pdfServices.getJobResult({
            pollingURL,
            resultType: OCRResult
        });

        // Get content from the resulting asset(s)
        const resultAsset = pdfServicesResponse.result.asset;
        const streamAsset = await pdfServices.getContent({asset: resultAsset});

        // Creates a write stream and copy stream asset's content to it
        const outputFilePath = "./ocrOutput.pdf";
        console.log(`Saving asset at ${outputFilePath}`);

        const writeStream = fs.createWriteStream(outputFilePath);
        // Wait for the copy to finish so that read/write stream errors reach
        // the catch block below instead of being emitted after it has exited.
        await new Promise((resolve, reject) => {
            streamAsset.readStream.on("error", reject);
            writeStream.on("error", reject);
            writeStream.on("finish", resolve);
            streamAsset.readStream.pipe(writeStream);
        });
    } catch (err) {
        // SDK errors (SDKError, ServiceUsageError, ServiceApiError) and any
        // other failure are reported with the same message.
        console.log("Exception encountered while executing operation", err);
    } finally {
        // Release the input file handle whether or not the job succeeded.
        readStream?.destroy();
    }
})();
# Please refer to our REST API docs for more information
# https://developer.adobe.com/document-services/docs/apis/#tag/Ocr

# Submits an OCR job for an already uploaded asset.
# Replace the {{Placeholder ...}} values and the assetID before running.
curl --location --request POST 'https://pdf-services.adobe.io/operation/ocr' \
--header 'x-api-key: {{Placeholder for client_id}}' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {{Placeholder for token}}' \
--data-raw '{
    "assetID": "urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718"
}'
OCR with explicit language
You can perform OCR on files in other languages, including German,
French, Danish, and other languages. Refer to OCRSupportedLocale and
OCRSupportedType in the API docs for a list of supported OCR locales
and OCR types.
As shown in the OcrPDFWithOptions sample, when you make a PDF file searchable, you specify both the locale (language) and the type. There are two types, which produce different results:
- One type ensures that text is searchable and selectable, but modifies the original image during the cleanup process (for example, deskews it) before placing an invisible text layer over it. This type removes unwanted artifacts and may result in a more readable document in some scenarios.
- The second (EXACT) type also overlays a searchable text layer over the original image, but in this case the original image is unchanged. This type produces maximum fidelity to the original image.
Please refer to the API usage guide to understand how to use our APIs.
Copied to your clipboard1// Get the samples from https://www.adobe.com/go/pdftoolsapi_java_samples2// Run the sample:3// mvn -f pom.xml exec:java Dexec.mainClass=com.adobe.pdfservices.operation.samples.ocrpdf.OcrPDFWithOptions45 public class OcrPDFWithOptions {6 // Initialize the logger.7 private static final Logger LOGGER = LoggerFactory.getLogger(OcrPDFWithOptions.class);89 public static void main(String[] args) {1011 try (InputStream inputStream = Files.newInputStream(new File("src/main/resources/ocrInput.pdf").toPath())) {12 // Initial setup, create credentials instance13 Credentials credentials = new ServicePrincipalCredentials(14 System.getenv("PDF_SERVICES_CLIENT_ID"),15 System.getenv("PDF_SERVICES_CLIENT_SECRET"));1617 // Creates a PDF Services instance18 PDFServices pdfServices = new PDFServices(credentials);1920 // Creates an asset(s) from source file(s) and upload21 Asset asset = pdfServices.upload(inputStream, PDFServicesMediaType.PDF.getMediaType());2223 // Create parameters for the job24 OCRParams ocrParams = OCRParams.ocrParamsBuilder()25 .withOCRLocale(OCRSupportedLocale.EN_US)26 .withOCRType(OCRSupportedType.SEARCHABLE_IMAGE_EXACT)27 .build();2829 // Creates a new job instance30 OCRJob ocrJob = new OCRJob(asset).setParams(ocrParams);3132 // Submit the job and gets the job result33 String location = pdfServices.submit(ocrJob);34 PDFServicesResponse<OCRResult> pdfServicesResponse = pdfServices.getJobResult(location, OCRResult.class);3536 // Get content from the resulting asset(s)37 Asset resultAsset = pdfServicesResponse.getResult().getAsset();38 StreamAsset streamAsset = pdfServices.getContent(resultAsset);3940 // Creates an output stream and copy stream asset's content to it41 Files.createDirectories(Paths.get("output/"));42 OutputStream outputStream = Files.newOutputStream(new File("output/ocrWithOptionsOutput.pdf").toPath());43 LOGGER.info("Saving asset at output/ocrWithOptionsOutput.pdf");44 IOUtils.copy(streamAsset.getInputStream(), 
outputStream);45 outputStream.close();46 } catch (ServiceApiException | IOException | SDKException | ServiceUsageException ex) {47 LOGGER.error("Exception encountered while executing operation", ex);48 }49 }50 }
// Get the samples from https://www.adobe.com/go/pdftoolsapi_net_samples
// Run the sample:
// cd OcrPDFWithOptions
// dotnet run OcrPDFWithOptions.csproj

namespace OcrPDFWithOptions
{
    /// <summary>
    /// Runs an OCR operation on ocrWithOptionsInput.pdf with an explicit locale
    /// (EN_US) and OCR type (SEARCHABLE_IMAGE_EXACT) and saves the result under
    /// the current directory as output/ocrOperationWithOptionsOutput.pdf.
    /// </summary>
    class Program
    {
        private static readonly ILog log = LogManager.GetLogger(typeof(Program));

        static void Main()
        {
            // Configure the logging
            ConfigureLogging();
            try
            {
                // Initial setup, create credentials instance.
                // The client id and secret are read from environment variables
                // instead of passing the variable names themselves as credentials.
                Credentials credentials = Credentials.ServicePrincipalCredentialsBuilder()
                    .WithClientId(System.Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"))
                    .WithClientSecret(System.Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET"))
                    .Build();

                // Create an ExecutionContext using credentials and create a new operation instance.
                ExecutionContext executionContext = ExecutionContext.Create(credentials);
                OCROperation ocrOperation = OCROperation.CreateNew();

                // Set operation input from a source file.
                FileRef sourceFileRef = FileRef.CreateFromLocalFile(@"ocrWithOptionsInput.pdf");
                ocrOperation.SetInput(sourceFileRef);

                // Build OCR options from supported locales and OCR-types and set them into the operation
                OCROptions ocrOptions = OCROptions.OCROptionsBuilder()
                    .WithOcrLocale(OCRSupportedLocale.EN_US)
                    .WithOcrType(OCRSupportedType.SEARCHABLE_IMAGE_EXACT)
                    .Build();
                ocrOperation.SetOptions(ocrOptions);

                // Execute the operation.
                FileRef result = ocrOperation.Execute(executionContext);

                // Save the result to the specified location.
                result.SaveAs(Directory.GetCurrentDirectory() + "/output/ocrOperationWithOptionsOutput.pdf");
            }
            catch (ServiceUsageException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            // Catch more errors here . . .
        }

        /// <summary>
        /// Configures log4net for the entry assembly from the log4net.config file.
        /// </summary>
        static void ConfigureLogging()
        {
            ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly());
            XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config"));
        }
    }
}
// Get the samples from http://www.adobe.com/go/pdftoolsapi_node_sample
// Run the sample:
// node src/ocr/ocr-pdf-with-options.js

const {
    ServicePrincipalCredentials,
    PDFServices,
    MimeType,
    OCRJob,
    OCRParams,
    OCRSupportedLocale,
    OCRSupportedType,
    OCRResult
} = require("@adobe/pdfservices-node-sdk");
const fs = require("fs");

// Uploads ./ocrInput.pdf, runs an OCR job with an explicit locale and OCR
// type, and writes the resulting asset to ./ocrWithOptionsOutput.pdf.
(async () => {
    let readStream;
    try {
        // Initial setup, create credentials instance
        const credentials = new ServicePrincipalCredentials({
            clientId: process.env.PDF_SERVICES_CLIENT_ID,
            clientSecret: process.env.PDF_SERVICES_CLIENT_SECRET
        });

        // Creates a PDF Services instance
        const pdfServices = new PDFServices({credentials});

        // Creates an asset(s) from source file(s) and upload
        readStream = fs.createReadStream("./ocrInput.pdf");
        const inputAsset = await pdfServices.upload({
            readStream,
            mimeType: MimeType.PDF
        });

        // Create parameters for the job
        const params = new OCRParams({
            ocrLocale: OCRSupportedLocale.EN_US,
            ocrType: OCRSupportedType.SEARCHABLE_IMAGE_EXACT
        });

        // Creates a new job instance
        const job = new OCRJob({inputAsset, params});

        // Submit the job and get the job result
        const pollingURL = await pdfServices.submit({job});
        const pdfServicesResponse = await pdfServices.getJobResult({
            pollingURL,
            resultType: OCRResult
        });

        // Get content from the resulting asset(s)
        const resultAsset = pdfServicesResponse.result.asset;
        const streamAsset = await pdfServices.getContent({asset: resultAsset});

        // Creates a write stream and copy stream asset's content to it
        const outputFilePath = "./ocrWithOptionsOutput.pdf";
        console.log(`Saving asset at ${outputFilePath}`);

        const writeStream = fs.createWriteStream(outputFilePath);
        // Wait for the copy to finish so that read/write stream errors reach
        // the catch block below instead of being emitted after it has exited.
        await new Promise((resolve, reject) => {
            streamAsset.readStream.on("error", reject);
            writeStream.on("error", reject);
            writeStream.on("finish", resolve);
            streamAsset.readStream.pipe(writeStream);
        });
    } catch (err) {
        // SDK errors (SDKError, ServiceUsageError, ServiceApiError) and any
        // other failure are reported with the same message.
        console.log("Exception encountered while executing operation", err);
    } finally {
        // Release the input file handle whether or not the job succeeded.
        readStream?.destroy();
    }
})();
# Please refer to our REST API docs for more information
# https://developer.adobe.com/document-services/docs/apis/#tag/Ocr

# Submits an OCR job for an already uploaded asset with an explicit OCR language.
# Replace the {{Placeholder ...}} values and the assetID before running.
curl --location --request POST 'https://pdf-services.adobe.io/operation/ocr' \
--header 'x-api-key: {{Placeholder for client_id}}' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {{Placeholder for token}}' \
--data-raw '{
    "assetID": "ce8fe9da-99f2-4d01-999e-42b9ce22ec5f",
    "ocrLang": "en-US"
}'