Extract PDF Content & Structure
Extract content from scanned and native PDFs to use for database insertion, content republishing, RPA, and more
REST API
Node.js
.NET
Java
Python
# Please refer to our REST API docs for more information:
# https://developer.adobe.com/document-services/docs/apis/#tag/Extract-PDF

# POSTs an Extract PDF request for the given asset, asking for text elements only.
# Replace the {{Placeholder ...}} values with your client ID and access token.
curl --location --request POST 'https://pdf-services.adobe.io/operation/extractpdf' \
  --header 'x-api-key: {{Placeholder for client_id}}' \
  --header 'Content-Type: application/json' \
  --header 'Authorization: Bearer {{Placeholder for token}}' \
  --data-raw '{"assetID": "urn:aaid:AS:UE1:23c30ee0-2e4d-46d6-87f2-087832fca718","elementsToExtract": ["text"]}'

# The legacy API can be found here:
# https://documentcloud.adobe.com/document-services/index.html#post-extractPDF
// Get the samples from http://www.adobe.com/go/pdftoolsapi_node_sample
// Run the sample:
// node src/extractpdf/extract-text-info-from-pdf.js

const {
    ServicePrincipalCredentials,
    PDFServices,
    MimeType,
    ExtractPDFParams,
    ExtractElementType,
    ExtractPDFJob,
    ExtractPDFResult
} = require("@adobe/pdfservices-node-sdk");
const fs = require("fs");
const {pipeline} = require("stream/promises");

/**
 * Uploads ./extractPDFInput.pdf, submits an Extract PDF job that requests text
 * elements, and saves the resulting asset to ./ExtractTextInfoFromPDF.zip.
 *
 * Credentials are read from the PDF_SERVICES_CLIENT_ID and
 * PDF_SERVICES_CLIENT_SECRET environment variables. Any exception raised along
 * the way is logged rather than rethrown.
 */
(async () => {
    let readStream;
    try {
        // Initial setup, create credentials instance
        const credentials = new ServicePrincipalCredentials({
            clientId: process.env.PDF_SERVICES_CLIENT_ID,
            clientSecret: process.env.PDF_SERVICES_CLIENT_SECRET
        });

        // Creates a PDF Services instance
        const pdfServices = new PDFServices({credentials});

        // Creates an asset(s) from source file(s) and upload
        readStream = fs.createReadStream("./extractPDFInput.pdf");
        const inputAsset = await pdfServices.upload({
            readStream,
            mimeType: MimeType.PDF
        });

        // Create parameters for the job
        const params = new ExtractPDFParams({
            elementsToExtract: [ExtractElementType.TEXT]
        });

        // Creates a new job instance
        const job = new ExtractPDFJob({inputAsset, params});

        // Submit the job and get the job result
        const pollingURL = await pdfServices.submit({job});
        const pdfServicesResponse = await pdfServices.getJobResult({
            pollingURL,
            resultType: ExtractPDFResult
        });

        // Get content from the resulting asset(s)
        const resultAsset = pdfServicesResponse.result.resource;
        const streamAsset = await pdfServices.getContent({asset: resultAsset});

        // Creates a write stream and copy stream asset's content to it
        const outputFilePath = "./ExtractTextInfoFromPDF.zip";
        console.log(`Saving asset at ${outputFilePath}`);
        const writeStream = fs.createWriteStream(outputFilePath);
        // Await the copy so that read/write stream failures reject here and are
        // handled by the catch below instead of surfacing as unhandled 'error' events.
        await pipeline(streamAsset.readStream, writeStream);
    } catch (err) {
        // SDK errors and unexpected errors are reported the same way.
        console.log("Exception encountered while executing operation", err);
    } finally {
        // readStream is undefined if the failure happened before it was opened.
        readStream?.destroy();
    }
})();
// Get the samples from https://www.adobe.com/go/pdftoolsapi_net_samples
// Run the sample:
// cd ExtractTextInfoFromPDF/
// dotnet run ExtractTextInfoFromPDF.csproj

namespace ExtractTextInfoFromPDF
{
    /// <summary>
    /// Uploads extractPDFInput.pdf, runs an Extract PDF job that requests text
    /// elements, and writes the resulting asset to
    /// &lt;current directory&gt;/output/ExtractTextInfoFromPDF.zip.
    /// Credentials are read from the PDF_SERVICES_CLIENT_ID and
    /// PDF_SERVICES_CLIENT_SECRET environment variables.
    /// </summary>
    class Program
    {
        private static readonly ILog log = LogManager.GetLogger(typeof(Program));

        /// <summary>
        /// Entry point. Every exception is logged and not rethrown.
        /// </summary>
        static void Main()
        {
            // Configure the logging.
            ConfigureLogging();
            try
            {
                // Initial setup, create credentials instance
                ICredentials credentials = new ServicePrincipalCredentials(
                    Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_ID"),
                    Environment.GetEnvironmentVariable("PDF_SERVICES_CLIENT_SECRET"));

                // Creates a PDF Services instance
                PDFServices pdfServices = new PDFServices(credentials);

                // Creates an asset from source file and upload
                using Stream inputStream = File.OpenRead(@"extractPDFInput.pdf");
                IAsset asset = pdfServices.Upload(inputStream, PDFServicesMediaType.PDF.GetMIMETypeValue());

                // Create parameters for the job
                ExtractPDFParams extractPDFParams = ExtractPDFParams.ExtractPDFParamsBuilder()
                    .AddElementToExtract(ExtractElementType.TEXT)
                    .Build();

                // Creates a new job instance
                ExtractPDFJob extractPDFJob = new ExtractPDFJob(asset).SetParams(extractPDFParams);

                // Submits the job and gets the job result
                String location = pdfServices.Submit(extractPDFJob);
                PDFServicesResponse<ExtractPDFResult> pdfServicesResponse =
                    pdfServices.GetJobResult<ExtractPDFResult>(location, typeof(ExtractPDFResult));

                // Get content from the resulting asset(s)
                IAsset resultAsset = pdfServicesResponse.Result.Resource;
                StreamAsset streamAsset = pdfServices.GetContent(resultAsset);

                // Creating output streams and copying stream asset's content to it
                String outputFilePath = "/output/ExtractTextInfoFromPDF.zip";
                String outputFullPath = Directory.GetCurrentDirectory() + outputFilePath;
                new FileInfo(outputFullPath).Directory.Create();

                // File.Create truncates an existing file; File.OpenWrite would leave stale
                // trailing bytes from a previous, larger run and produce a corrupt zip.
                // The using declaration closes the stream even if CopyTo throws.
                using Stream outputStream = File.Create(outputFullPath);
                streamAsset.Stream.CopyTo(outputStream);
            }
            catch (ServiceUsageException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            catch (ServiceApiException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            catch (SDKException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            catch (IOException ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
            catch (Exception ex)
            {
                log.Error("Exception encountered while executing operation", ex);
            }
        }

        /// <summary>
        /// Configures log4net for the entry assembly's repository from log4net.config.
        /// </summary>
        static void ConfigureLogging()
        {
            ILoggerRepository logRepository = LogManager.GetRepository(Assembly.GetEntryAssembly());
            XmlConfigurator.Configure(logRepository, new FileInfo("log4net.config"));
        }
    }
}
Copied to your clipboard// Get the samples from https://www.adobe.com/go/pdftoolsapi_java_samples// Run the sample:// mvn -f pom.xml exec:java -Dexec.mainClass=com.adobe.pdfservices.operation.samples.extractpdf.ExtractTextInfoFromPDFpublic class ExtractTextInfoFromPDF {private static final Logger LOGGER = LoggerFactory.getLogger(ExtractTextInfoFromPDF.class);public static void main(String[] args) {try (InputStream inputStream = Files.newInputStream(new File("src/main/resources/extractPdfInput.pdf").toPath())) {// Initial setup, create credentials instanceCredentials credentials = new ServicePrincipalCredentials(System.getenv("PDF_SERVICES_CLIENT_ID"),System.getenv("PDF_SERVICES_CLIENT_SECRET"));// Creates a PDF Services instancePDFServices pdfServices = new PDFServices(credentials);// Creates an asset(s) from source file(s) and uploadAsset asset = pdfServices.upload(inputStream, PDFServicesMediaType.PDF.getMediaType());// Create parameters for the jobExtractPDFParams extractPDFParams = ExtractPDFParams.extractPDFParamsBuilder().addElementsToExtract(Arrays.asList(ExtractElementType.TEXT)).build();// Creates a new job instanceExtractPDFJob extractPDFJob = new ExtractPDFJob(asset).setParams(extractPDFParams);// Submit the job and gets the job resultString location = pdfServices.submit(extractPDFJob);PDFServicesResponse<ExtractPDFResult> pdfServicesResponse = pdfServices.getJobResult(location, ExtractPDFResult.class);// Get content from the resulting asset(s)Asset resultAsset = pdfServicesResponse.getResult().getResource();StreamAsset streamAsset = pdfServices.getContent(resultAsset);// Creates an output stream and copy stream asset's content to itFiles.createDirectories(Paths.get("output/"));OutputStream outputStream = Files.newOutputStream(new File("output/ExtractTextInfoFromPDF.zip").toPath());LOGGER.info("Saving asset at output/ExtractTextInfoFromPDF.zip");IOUtils.copy(streamAsset.getInputStream(), outputStream);outputStream.close();} catch (ServiceApiException | 
IOException | SDKException | ServiceUsageException e) {LOGGER.error("Exception encountered while executing operation", e);}}}
# Get the samples from https://github.com/adobe/pdfservices-python-sdk-samples
# Run the sample:
# python src/extractpdf/extract_text_info_from_pdf.py

# Initialize the logger
logging.basicConfig(level=logging.INFO)


class ExtractTextInfoFromPDF:
    """Extract text elements from extractPdfInput.pdf into extractTextInfoFromPDF.zip.

    The whole flow runs in the constructor: read the input PDF, upload it,
    submit an Extract PDF job requesting text elements, and write the result
    to disk. Credentials are read from the PDF_SERVICES_CLIENT_ID and
    PDF_SERVICES_CLIENT_SECRET environment variables.
    """

    def __init__(self):
        try:
            # The context manager closes the input file even if read() raises.
            with open("extractPdfInput.pdf", "rb") as file:
                input_stream = file.read()

            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv("PDF_SERVICES_CLIENT_ID"),
                client_secret=os.getenv("PDF_SERVICES_CLIENT_SECRET"),
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            extract_pdf_params = ExtractPDFParams(
                elements_to_extract=[ExtractElementType.TEXT],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Creates an output stream and copy stream asset's content to it
            output_file_path = "extractTextInfoFromPDF.zip"
            with open(output_file_path, "wb") as file:
                file.write(stream_asset.get_input_stream())

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            # Only SDK/service exceptions are logged here; other errors propagate.
            logging.exception(f"Exception encountered while executing operation: {e}")


if __name__ == "__main__":
    ExtractTextInfoFromPDF()

