How To Extract Text And Image From PDF In Java Applications

Add Dependencies

 
Before we go to the coding section, we need to add Spire.PDF to our Java project. There are two ways to add the needed dependencies to our project.
 
For Maven projects, we need to add the following code to our project’s pom.xml file.
  1. <!-- Repository that hosts the Spire.PDF artifacts. HTTPS is used because Maven 3.8.1 and later block plain-HTTP repositories by default. -->
  2. <repositories>
  3.     <repository>
  4.         <id>com.e-iceblue</id>
  5.         <name>e-iceblue</name>
  6.         <url>https://repo.e-iceblue.com/nexus/content/groups/public/</url>
  7.     </repository>
  8. </repositories>
  9. <!-- Free edition of Spire.PDF for Java, resolved from the repository declared above -->
  10. <dependencies>
  11.     <dependency>
  12.         <groupId>e-iceblue</groupId>
  13.         <artifactId>spire.pdf.free</artifactId>
  14.         <version>3.9.0</version>
  15.     </dependency>
  16. </dependencies>
For non-Maven projects, download the Free Spire.PDF for Java package from this website and add the Spire.Pdf.jar file from its lib folder to the project as a dependency.
 
Example 1. Extract all text from the whole PDF
 
The Spire.PDF for Java library provides the page.extractText() method, which can be used to extract text from each page of a PDF document. In the following example, you will learn how to extract all text from a single PDF document by using this API.
  1. import com.spire.pdf.*;  
  2. import com.spire.pdf.PdfPageBase;  
  3. import java.io.*;  
  4. public class extractAllTexts {  
  5.     public static void main(String[] args) throws Exception {  
  6.         String input = "Sample.pdf";  
  7.         //Load the PDF file  
  8.         PdfDocument pdf = new PdfDocument();  
  9.         pdf.loadFromFile(input);  
  10.         //Create a new txt file to save the extracted text  
  11.         String result = "output/extractAllText.txt";  
  12.         File file = new File(result);  
  13.         if (!file.exists()) {  
  14.             file.delete();  
  15.         }  
  16.         file.createNewFile();  
  17.         FileWriter fw = new FileWriter(file, true);  
  18.         BufferedWriter bw = new BufferedWriter(fw);  
  19.         //Extract text from all the pages on the PDF  
  20.         PdfPageBase page;  
  21.         for (int i = 0; i < pdf.getPages().getCount(); i++) {  
  22.             page = pdf.getPages().get(i);  
  23.             String text = page.extractText(true);  
  24.             bw.write(text);  
  25.         }  
  26.         bw.flush();  
  27.         bw.close();  
  28.         fw.close();  
  29.     }  
  30. }  
Example 2. Extract text from specific area
 
Spire.PDF for Java enables developers to extract text from a specific area of a PDF page by using the page.extractText(new Rectangle2D.Float(80, 200, 500, 200)) method.
  1. import com.spire.pdf.*;  
  2. import java.awt.geom.Rectangle2D;  
  3. import java.io.*;  
  4. public class extractTextFromSpecificArea {  
  5.     public static void main(String[] args) throws Exception {  
  6.         String input = "Sample.pdf";  
  7.         //Load the PDF file  
  8.         PdfDocument pdf = new PdfDocument();  
  9.         pdf.loadFromFile(input);  
  10.         //Create a new txt file to save the extracted text  
  11.         String result = "output/extractText.txt";  
  12.         File file = new File(result);  
  13.         if (!file.exists()) {  
  14.             file.delete();  
  15.         }  
  16.         file.createNewFile();  
  17.         FileWriter fw = new FileWriter(file, true);  
  18.         BufferedWriter bw = new BufferedWriter(fw);  
  19.         //Get the first page  
  20.         PdfPageBase page = pdf.getPages().get(0);  
  21.         //Extract text from a specific rectangle area within the page  
  22.         String text = page.extractText(new Rectangle2D.Float(80, 200, 500, 200));  
  23.         bw.write(text);  
  24.         bw.flush();  
  25.         bw.close();  
  26.         fw.close();  
  27.     }  
  28. }  
Example 3. Extract highlighted text from PDF
 
Some PDFs contain text that has been highlighted with a color. Spire.PDF offers the page.extractText(textMarkupAnnotation.getBounds()) method to extract the highlighted text from the PDF.
  1. import com.spire.pdf.*;  
  2. import java.io.*;  
  3. import com.spire.pdf.annotations.*;  
  4. import com.spire.pdf.graphics.*;  
  5. public class extractHighlightedText {  
  6.     public static void main(String[] args) throws Exception {  
  7.         String input = "Sample.pdf";  
  8.         //Load the PDF file  
  9.         PdfDocument pdf = new PdfDocument();  
  10.         pdf.loadFromFile(input);  
  11.         //Create a new txt file to save the extracted text  
  12.         String result = "output/extractText1.txt";  
  13.         File file = new File(result);  
  14.         if (!file.exists()) {  
  15.             file.delete();  
  16.         }  
  17.         file.createNewFile();  
  18.         FileWriter fw = new FileWriter(file, true);  
  19.         BufferedWriter bw = new BufferedWriter(fw);  
  20.         bw.write("Extracted highlighted text:");  
  21.         PdfPageBase page = pdf.getPages().get(0);  
  22.         for (int i = 0; i < page.getAnnotationsWidget().getCount(); i++) {  
  23.             if (page.getAnnotationsWidget().get(i) instanceof PdfTextMarkupAnnotationWidget) {  
  24.                 PdfTextMarkupAnnotationWidget textMarkupAnnotation = (PdfTextMarkupAnnotationWidget) page.getAnnotationsWidget().get(i);  
  25.                 bw.write(page.extractText(textMarkupAnnotation.getBounds()));  
  26.                 //Get the highlighted color  
  27.                 PdfRGBColor color = textMarkupAnnotation.getColor();  
  28.                 bw.write("Color=" + (color.getR() & 0XFF) + "," + (color.getG() & 0XFF) + "," + (color.getB() & 0XFF) + "\n");  
  29.             }  
  30.         }  
  31.         bw.flush();  
  32.         bw.close();  
  33.         fw.close();  
  34.     }  
  35. }  
Example 4. Extract images from PDF
 
Spire.PDF for Java offers a page.extractImages() method to extract images from the PDF file.
  1. import com.spire.pdf.*;  
  2. import javax.imageio.ImageIO;  
  3. import java.awt.image.BufferedImage;  
  4. import java.io.*;  
  5. import java.util.ArrayList;  
  6. public class extractImages {  
  7.     public static void main(String[] args) throws Exception {  
  8.         //Load the PDF File  
  9.         PdfDocument doc = new PdfDocument();  
  10.         doc.loadFromFile("Sample.pdf");  
  11.         StringBuilder buffer = new StringBuilder();  
  12.         ArrayList < BufferedImage > images = new ArrayList < BufferedImage > ();  
  13.         //loop through the pages  
  14.         for (PdfPageBase page: (Iterable < PdfPageBase > ) doc.getPages()) {  
  15.             //extract images from a particular page  
  16.             for (BufferedImage image: page.extractImages()) {  
  17.                 //declare an int variable  
  18.                 int index = 0;  
  19.                 //specify the file path and name  
  20.                 File output = new File("output/" + String.format("Image_%d.png", index++));  
  21.                 //save image as .png file  
  22.                 ImageIO.write(image, "PNG", output);  
  23.             }  
  24.         }  
  25.     }  
  26. }  


Similar Articles