Get displayed size of an image in a pdf with PDFBox
I am trying to compute the percentage of each page's area that is occupied by the images in a PDF. I have the following code:
// Counts, per page, the image XObjects whose average colour is not pure black,
// writes each of them to a PNG file and prints the count for the page.
PDPageTree list = document.getPages();
int pageNumber = 0;
float imagePerPage = 0;
for (PDPage page : list) {
    // Page raster at scale 2; its dimensions are read here but are not yet used by the count below.
    BufferedImage pageImage = renderer.renderImage(pageNumber, 2);
    float pageWidth = pageImage.getWidth();
    float pageHeight = pageImage.getHeight();
    PDResources pdResources = page.getResources();
    int i = 1;
    // NOTE(review): getResources() appears to return null for a page without a
    // resource dictionary, so guard before iterating.
    if (pdResources != null) {
        for (COSName name : pdResources.getXObjectNames()) {
            PDXObject object = pdResources.getXObject(name);
            if (object instanceof PDImageXObject) {
                PDImageXObject image = (PDImageXObject) object;
                // Decode once and reuse the result for both the scan and the export.
                BufferedImage bufferedImage = image.getImage();
                int imageWidth = bufferedImage.getWidth();
                int imageHeight = bufferedImage.getHeight();
                // long accumulators: an int sum of up to 255 per pixel overflows
                // once an image exceeds roughly 8.4 million pixels.
                long sumr = 0;
                long sumg = 0;
                long sumb = 0;
                for (int y = 0; y < imageHeight; y++) {
                    for (int x = 0; x < imageWidth; x++) {
                        // Unpack the packed RGB value directly instead of allocating a Color per pixel.
                        int rgb = bufferedImage.getRGB(x, y);
                        sumr += (rgb >> 16) & 0xFF;
                        sumg += (rgb >> 8) & 0xFF;
                        sumb += rgb & 0xFF;
                    }
                }
                // Divide by the number of pixels actually visited above.
                long num = (long) imageWidth * imageHeight;
                boolean averageIsBlack = sumr / num == 0 && sumg / num == 0 && sumb / num == 0;
                if (!averageIsBlack) {
                    // Include the page number: i restarts at 1 on every page, so a name built
                    // from i alone would overwrite the files written for earlier pages.
                    String filename = "extracted-image-" + (pageNumber + 1) + "-" + i + ".png";
                    ImageIO.write(bufferedImage, "png", new File(filename));
                    imagePerPage++;
                }
                i++;
            }
        }
    }
    System.out.println("Image per page ratio is: " + imagePerPage);
    imagePerPage = 0;
    pageNumber++;
}
However, bufferedImage.getWidth() and bufferedImage.getHeight() return the actual size of the image in pixels. How could I get the displayed size for each image?
UPDATE 1
I have tried using the PrintImageLocations.java example to retrieve the displayed image sizes. However, for a real PDF it seems to give wrong results.
For this PDF, whose media box is 612 wide and 792 high, the reported scaled size of 25 (width) by 16.61 (height) for the images does not seem to be correct. After all, each image takes up at least a third of the total page width.
I built something that works for me, based on the example on the PDFBox documentation site:
package br.gov.pb.mp.framework.util.pdf;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
/**
 * Content-stream engine that records the displayed size of the image XObjects
 * drawn while a page is processed.
 *
 * <p>For every {@code Do} operator that paints an image, the scaling factors of the
 * current transformation matrix are stored under the image's resource name. Form
 * XObjects are processed recursively so that images nested in them are recorded too.
 *
 * <p>Results are keyed by resource name only, so an image drawn more than once (or two
 * images sharing a name in different resource dictionaries) keeps only the size of
 * the last draw. Entries are never cleared by this class; use a fresh instance per
 * page if per-page results are needed.
 */
public class ImageSizeExtractor extends PDFStreamEngine {

    /** Image resource name mapped to {x scale, y scale} of the CTM at draw time. */
    private Map<String, float[]> pageImages;

    /**
     * Registers the operators needed to track the graphics state and to draw XObjects.
     *
     * @throws IOException declared for callers; nothing in this constructor visibly throws it
     */
    public ImageSizeExtractor() throws IOException {
        // preparing PDFStreamEngine
        addOperator(new Concatenate());
        addOperator(new DrawObject());
        addOperator(new SetGraphicsStateParameters());
        addOperator(new Save());
        addOperator(new Restore());
        addOperator(new SetMatrix());
        pageImages = new HashMap<String, float[]>();
    }

    /**
     * @return the live map of image name to {displayed width, displayed height}
     */
    public Map<String, float[]> getPageImages() {
        return pageImages;
    }

    /**
     * Replaces the result map.
     *
     * @param pageImages the map that subsequent image draws are recorded into
     */
    public void setPageImages(Map<String, float[]> pageImages) {
        this.pageImages = pageImages;
    }

    /**
     * Intercepts the {@code Do} operator to record image sizes; every other operator
     * is passed on to the base engine.
     *
     * @param operator the operator being processed
     * @param operands the operands preceding the operator in the content stream
     * @throws IOException if processing the operator or a nested form fails
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
        boolean isDraw = "Do".equals(operator.getName());
        // A Do without a name operand cannot be resolved here; defer to the base engine
        // rather than failing with an unchecked IndexOutOfBounds/ClassCast exception.
        if (!isDraw || operands.isEmpty() || !(operands.get(0) instanceof COSName)) {
            super.processOperator(operator, operands);
            return;
        }

        COSName objectName = (COSName) operands.get(0);
        // get the PDF object
        PDXObject xobject = getResources().getXObject(objectName);
        if (xobject instanceof PDImageXObject) {
            System.out.println("\nImage [" + objectName.getName() + "]");
            // The CTM in effect at draw time carries the image's displayed extent.
            Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
            float imageXScale = ctmNew.getScalingFactorX();
            float imageYScale = ctmNew.getScalingFactorY();
            System.out.println("displayed size = " + imageXScale + ", " + imageYScale + " in user space units");
            float[] xy = {imageXScale, imageYScale};
            pageImages.put(objectName.getName(), xy);
        } else if (xobject instanceof PDFormXObject) {
            // Recurse into the form so images drawn inside it are seen as well.
            showForm((PDFormXObject) xobject);
        }
    }
}
A method that makes use of the above class:
/**
 * Tells whether the PDF contains at least one page that looks like a full-page image.
 *
 * <p>A page qualifies when its extracted text is blank, or when any image recorded by
 * {@code ImageSizeExtractor} covers more than 80% of the page's media box area.
 * Scanning stops at the first qualifying page.
 *
 * @param sourcePdf the PDF document as raw bytes
 * @return {@code true} if a qualifying page was found, {@code false} otherwise
 * @throws Throwable if the document cannot be loaded or a page cannot be processed
 */
public static boolean analyseImageEntirePagePdfAto(byte[] sourcePdf) throws Throwable {
    final float fullPageThresholdPercent = 80;
    PDDocument docAto = PDDocument.load(sourcePdf);
    try {
        // One stripper is enough; only the page window changes per iteration.
        PDFTextStripper reader = new PDFTextStripper();
        int p = 0;
        for (PDPage pagina : docAto.getPages()) {
            p++;
            reader.setStartPage(p);
            reader.setEndPage(p);
            String pageText = reader.getText(docAto);
            // Compare by content, not by reference: `pageText == ""` is false for any
            // string built at runtime, so text-less pages were never detected. trim()
            // also covers platforms whose line separator is not "\r\n".
            if (pageText == null || pageText.trim().isEmpty()) {
                return true;
            }
            float ph = pagina.getMediaBox().getHeight();
            float pw = pagina.getMediaBox().getWidth();
            float pageArea = ph * pw;
            ImageSizeExtractor imageSizeExtractor = new ImageSizeExtractor();
            imageSizeExtractor.processPage(pagina);
            for (Map.Entry<String, float[]> entry : imageSizeExtractor.getPageImages().entrySet()) {
                float[] imageMeasures = entry.getValue();
                float imageArea = imageMeasures[0] * imageMeasures[1];
                float imgPercent = (imageArea / pageArea) * 100;
                if (imgPercent > fullPageThresholdPercent) {
                    // Return immediately; the remaining pages cannot change the answer.
                    return true;
                }
            }
        }
        return false;
    } finally {
        // Release the document's resources on every exit path.
        docAto.close();
    }
}
The processPage(PDPage page) method is inherited from the PDFStreamEngine class.