// pdf2rdf.java

//default package

import java.io.FileInputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.util.URIref;
import com.hp.hpl.jena.vocabulary.DC;
import com.hp.hpl.jena.vocabulary.DCTerms;
import com.hp.hpl.jena.vocabulary.DCTypes;
import com.hp.hpl.jena.vocabulary.RDF;

/**
 * Quick'n'dirty PDF to RDF converter.
 *
 * <p>Parses a PDF file with PDFBox, reads its document information entries
 * (title, author, dates, ...) and records them as properties of a resource in
 * a Jena RDF model. When run from the command line, the resulting model is
 * printed to standard output in RDF/XML-ABBREV and in N3.
 *
 * @author mimas@mimas.ceti.pl
 * @version 1.0
 *
 * @see <a href="http://www.pdfbox.org/">PDFBox - Java PDF Library</a>
 * @see <a href="http://jena.sourceforge.net/">Jena - Semantic Web Framework</a>
 */
public class PDF2RDF {

  /** Namespace under which the PDF document information entries are published. */
  private static final String NS_PDF = "http://ns.adobe.com/pdf/1.3/";

  /**
   * Command-line entry point: converts one PDF file and dumps the model.
   *
   * @param args expects the PDF file name as the first element; prints a
   *             usage line and exits with status 1 when it is missing
   * @throws Exception if the file cannot be read or parsed
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.out.println( "Usage: java PDF2RDF <pdf-file-name>" );
      System.exit(1);
    }

    Model model = ModelFactory.createDefaultModel();
    new PDF2RDF().addPDFResource(model, args[0]);

    // As a test, print the RDF read from the PDF
    // in the abbreviated RDF/XML notation...
    System.out.println("*** RDF/XML:\n");
    model.write(System.out, "RDF/XML-ABBREV");
    // ...and in N3 notation.
    System.out.println("\n*** N3:\n");
    model.write(System.out, "N3");
  }

  /**
   * Parses the given PDF file and adds a resource describing it to the model.
   *
   * <p>The resource always receives {@code dc:type} and {@code dc:format}.
   * Every non-null document information entry is added as a property in the
   * {@code pdf} namespace, and each keyword additionally becomes a
   * {@code dc:subject}. If the document is encrypted and cannot be decrypted
   * with an empty password, an error is written to standard error and the
   * resource is returned with only the type and format properties.
   *
   * @param model RDF model to which the PDF resource is added
   * @param filename name of the PDF file
   * @return the resource created for the PDF file
   * @throws IOException if the file cannot be opened, parsed or closed
   */
  public Resource addPDFResource(Model model, String filename) throws IOException {
    Resource pdf = null;
    PDDocument document = null;
    FileInputStream file = null;
    try {
      file = new FileInputStream(filename);
      PDFParser parser = new PDFParser(file);
      parser.parse();
      document = parser.getPDDocument();

      model.setNsPrefix("rdf", RDF.getURI());
      model.setNsPrefix("dc", DC.NS);
      model.setNsPrefix("dct", DCTerms.NS);
      model.setNsPrefix("pdf", NS_PDF);

      pdf = model.createResource(URIref.encode("file://" + filename));
      pdf.addProperty(DC.type, DCTypes.Text);
      pdf.addProperty(DC.format, "application/pdf" );

      if (document.isEncrypted()) {
        try {
          // Try the empty password before giving up on the metadata.
          document.decrypt("");
        } catch (Exception e) {
          // Best effort: keep the bare resource, but report why decryption failed.
          System.err.println("Error: Failed to decrypt document.");
          System.err.println("Cause: " + e);
          return pdf;
        }
      }

      PDDocumentInformation info = document.getDocumentInformation();
      String keywords = info.getKeywords();
      addProperty(model, pdf, NS_PDF + "Title", info.getTitle());
      addProperty(model, pdf, NS_PDF + "Subject", info.getSubject());
      addProperty(model, pdf, NS_PDF + "Author", info.getAuthor());
      addProperty(model, pdf, NS_PDF + "CreationDate", formatDate(info.getCreationDate()));
      addProperty(model, pdf, NS_PDF + "ModDate", formatDate(info.getModificationDate()));
      addProperty(model, pdf, NS_PDF + "Creator", info.getCreator());
      addProperty(model, pdf, NS_PDF + "Producer", info.getProducer());
      addProperty(model, pdf, NS_PDF + "Keywords", keywords);
      addKeywordSubjects(pdf, keywords);

    } finally {
      // Nested finally so that a failure while closing the stream cannot
      // prevent the parsed document from being closed as well.
      try {
        if (file != null) {
          file.close();
        }
      } finally {
        if (document != null) {
          document.close();
        }
      }
    }
    return pdf;
  }

  /**
   * Adds each whitespace-separated keyword as a {@code dc:subject} of the resource.
   *
   * @param pdf resource that receives the subjects
   * @param keywords raw keyword string; {@code null} adds nothing
   */
  private void addKeywordSubjects(Resource pdf, String keywords) {
    if (keywords == null) {
      return;
    }
    // Split on runs of whitespace so repeated separators do not produce
    // empty subjects; a blank string still yields one empty token, hence
    // the length check below.
    String[] tokens = keywords.trim().split("\\s+");
    for (int i = 0; i < tokens.length; i++) {
      if (tokens[i].length() > 0) {
        pdf.addProperty(DC.subject, tokens[i]);
      }
    }
  }

  /**
   * Adds a literal-valued property to the resource, skipping null values.
   *
   * @param m model used to create the property
   * @param r resource that receives the property
   * @param p full URI of the property
   * @param s property value; nothing is added when it is {@code null}
   */
  private void addProperty(Model m, Resource r, String p, String s) {
    if (s != null) {
      r.addProperty(m.createProperty(p), s);
    }
  }

  /**
   * Formats a date using the pattern {@code yyyy.MM.dd HH:mm:ss}.
   *
   * <p>NOTE(review): the formatter is created with default settings and the
   * calendar's own time zone is not applied to it - confirm this is intended.
   *
   * @param date date to format, may be {@code null}
   * @return the formatted date, or {@code null} when {@code date} is {@code null}
   */
  private String formatDate(Calendar date) {
    String retval = null;
    if (date != null) {
      SimpleDateFormat formatter = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss");
      retval = formatter.format(date.getTime());
    }
    return retval;
  }

}