Parsing information from a PDF file

We often encounter interesting problem-solving opportunities when participating in LinkedIn Iguana group discussions. Recently, we were asked if it was possible to retrieve data from a PDF file. During the discussions that followed, someone raised the idea of using a third-party utility to convert PDF to XML. I decided to come up with a way to invoke this method from within the Translator itself. The following steps summarize my solution:

  1. Create a channel for the incoming message and its PDF attachment.
  2. Write the PDF to a scratch directory.
  3. Invoke the third-party conversion program to convert the PDF file into XML data.
  4. Use Iguana’s xml functions to parse and manipulate the XML data as desired.

Here is the specific procedure:

  1. Download and install an appropriate third-party conversion utility. For the code example below, we chose PDF to XML. (It may require LibXml2 from GNU/XMLSoft.)
  2. Create and configure a new channel with an LLP Listener source component and a To Translator destination component.
  3. Open the script and commit the first milestone.
  4. Copy and paste the following code snippet into your script, or require it as an external module.

    Note: The purpose of each section in this script is identified with notes right in the code.

local pdf = {}

-- Online help for pdf.convert. This table is handed to help.set at the
-- bottom of the module. The text describes the options pdf.convert
-- actually accepts: images are opt-in through the 'images' parameter
-- (the '-noImage' switch is an internal command line detail, not
-- something the caller passes).
local pdfhelp = {
       Title="pdf.convert";
       Usage="local Xml, Images = pdf.convert{file='Doctor Report.pdf', images=true}",
       Desc=[[Under the hood this function invokes the PDFtoXML utility. Please note: if the "images" option is set,
              it is the calling function's responsibility to clean up the generated image files.]];
       ["Returns"] = {
          {Desc="A XML node tree with the converted contents of the PDF document"},
          {Desc=[[Optionally a list of the images found in the document.  This is
                  only returned when the 'images' parameter is set to true.]]}
       };
       ParameterTable= true,
       Parameters= {
           {file= {Desc='The source PDF file to convert.'}},
           {images= {Desc='This is an optional flag you can set to true to also extract images.'; Opt=true}},
       };
       Examples={
           "local Xml = pdf.convert{file='Doctor Report.pdf'}",
       };
       SeeAlso={
           {
               Title="Tips and tricks from John Verne",
               Link="http://wiki.interfaceware.com/1338.html"
           }
       }
   }

--
-- Helper functions
--

-- Directory separator of the host platform, taken from the first
-- character of package.config. Note: this is assigned without `local`,
-- so it is a global visible outside this module.
LUA_DIRSEP = package.config:sub(1, 1)

-- Validate the parameter table given to pdf.convert.
--
-- T: the caller's parameter table; T.file is required and must be a
--    non-empty string, T.images is optional and must be a boolean when
--    present.
--
-- Returns true when the options look usable, otherwise false plus a
-- message describing the first problem found.
local function validateOpts(T)
   -- Guard against a missing or non-table argument so the caller gets a
   -- validation message instead of an "attempt to index" error.
   if type(T) ~= 'table' then
      return false, 'Parameters must be passed as a table containing "file"'
   end

   -- Keep these local: without `local` they would leak into (and
   -- overwrite) globals named `file` and `images`.
   local file = T.file
   local images = T.images

   if file == nil then
      return false, 'Missing required parameter "file"'
   end

   if file == '' or type(file) ~= 'string' then
      return false, 'Parameter "file" must be of type string and not empty.'
   end

   if images ~= nil and type(images) ~= 'boolean' then
      return false, 'Parameter "images" must be of type boolean'
   end

   return true
end

-- Invoke the third-party conversion tool with the right options to
-- create the XML data for the given PDF file.
--
-- pdffile: path of the PDF to convert. It must contain a drive letter
--          (e.g. "C:\dir\report.pdf"); anything else is rejected.
-- cmdOpts: extra command line options, inserted verbatim after the
--          executable name.
--
-- The executable is located through pdf.toolPath, which is read from
-- the module table (pdf.convert checks that it is non-empty before
-- calling this function).
--
-- Returns three values:
--   xmlfile  - path of the XML file the tool was asked to write, or nil
--              on failure
--   imagedir - xmlfile .. '_data' (nil when the input path is invalid
--              or the command could not be started)
--   data     - everything the command printed (stdout and stderr), or
--              an error message on failure
local function generateXMLoutput(pdffile, cmdOpts)
   -- Split the Windows pathname: `path` is everything up to and
   -- including the last separator (drive included), `vol` is the drive
   -- ("C:") and `basename` is the file name up to and including its
   -- final dot.
   local path, vol, _, basename
      = pdffile:match("(([%a]:).-)(([^\\/]-%.?)([^%.\\/]*))$")

   if path == nil or vol == nil or basename == nil then
      return nil, nil, 'source file "' .. pdffile .. '" is not valid.'
   end

   -- Default locations.
   local xmlfile = path .. basename .. 'xml'
   local imagedir = xmlfile .. '_data'

   -- Split the tool location into its directory and executable name.
   -- A separate name is used so the PDF's `path` is not shadowed.
   local toolDir, exe = pdf.toolPath:match('(.-)([^\\/]-%.?([^%.\\/]*))$')

   -- `vol` is the drive of the PDF, not of the tool, so `cd /d` is used
   -- to also switch drives when the tool lives on a different one.
   local command = vol .. ' & cd /d "' .. toolDir .. '" & '
      .. exe .. ' ' .. cmdOpts
      .. ' "' .. pdffile .. '" "' .. xmlfile .. '"'
      .. ' 2>&1'

   -- pdf.toolName may not be set; fall back to the executable name so
   -- building an error message can never raise on its own.
   local toolLabel = tostring(pdf.toolName or exe)

   -- Run the command and get the output.
   local pipe, openErr = io.popen(command)
   if pipe == nil then
      return nil, nil, 'invocation of ' .. toolLabel
         .. ' failed: ' .. tostring(openErr)
   end
   local data = pipe:read("*a")
   pipe:close() -- always release the process handle

   -- Handle exception cases. Sometimes the data we get is
   -- data we want, and other times it is command error output.
   -- So, normalize the returns here.
   if data == nil then
      data = 'invocation of ' .. toolLabel .. ' failed with no message.'
      xmlfile = nil
   end

   return xmlfile, imagedir, data
end

-- Load the complete contents of the XML file at `xmlfile` and return
-- them as one string. The file is opened in binary mode; if it cannot
-- be opened, assert raises an error carrying io.open's message.
local function getXMLdata(xmlfile)
   local handle = assert(io.open(xmlfile, 'rb'))
   local contents = handle:read('*a')
   handle:close()
   return contents
end

-- Report whether an option flag counts as "on".
-- nil, false and the empty string are treated as not set (false);
-- every other value yields true.
local function isSet(flag)
   return not (flag == nil or flag == '' or not flag)
end

-- Build a list of the *.jpg entries that os.fs.glob reports inside
-- `imageDir`. Returns an array of the first value produced for each
-- match (empty when nothing matches).
local function getImages(imageDir)
   local pattern = imageDir .. LUA_DIRSEP .. '*.jpg'
   local found = {}

   for fileName in os.fs.glob(pattern) do
      found[#found + 1] = fileName
   end

   return found
end

-- Main conversion entry point.
--
-- T: parameter table.
--    T.file   (string, required)  - the PDF file to convert.
--    T.images (boolean, optional) - when true, the tool is run without
--                                   '-noImage' and the generated JPEG
--                                   files are listed in the second
--                                   return value.
--
-- Returns the parsed XML document and, when T.images is set, a table of
-- image file names (nil otherwise).
--
-- Raises an error when the parameters are invalid, when pdf.toolPath is
-- missing or empty, or when the XML output could not be produced/read.
function pdf.convert(T)
   -- Validate parameters. These are local so that no `valid`/`err`
   -- globals are created as a side effect of calling the function.
   local valid, err = validateOpts(T)
   if not valid then
      error(err , 2)
   end

   -- Is the path to the conversion tool something reasonable?
   local toolPath = pdf.toolPath or ''
   if toolPath == '' then
      error('Path to conversion executable is bad or missing', 1)
   end

   -- Set/normalize options
   local wantImages = isSet(T.images)
   local opts = ''
   if not wantImages then
      opts = opts .. " -noImage "
   end

   -- Get the PDF data and metadata written out to an XML file.
   -- `output` is whatever the command printed, or an error message.
   local xmlfile, imagedir, output = generateXMLoutput(T.file, opts)
   if not xmlfile then
      error('Could not generate XML file: ' .. tostring(output), 2)
   end

   -- Read the XML datafile. If it cannot be read the tool most likely
   -- failed, so report what it printed instead of discarding it.
   local readOk, xmlData = pcall(getXMLdata, xmlfile)

   -- Remove the temporary file before parsing, so it does not get left
   -- behind when the parse raises. This is not critical, so ignore the
   -- return.
   os.remove(xmlfile)

   if not readOk then
      error('Could not generate XML file: ' .. tostring(xmlData)
         .. ' (tool output: ' .. tostring(output) .. ')', 2)
   end

   -- nil data is parsed as an empty document.
   local document = xml.parse{data=xmlData or ''}

   -- Load up the table of images created, if appropriate.
   local images = nil
   if wantImages then
      images = getImages(imagedir)
   end

   return document, images
end

-- Register the pdfhelp table as the help data for pdf.convert, then
-- export the module table.
help.set({
   input_function = pdf.convert,
   help_data = pdfhelp,
})

return pdf

Implementation Notes

Note: PDF to XML is not under active development, and there is no source code for the project currently available. However, there are binaries available for Windows and Linux. Currently, there is no good free cross-platform solution for creating XML or XHTML data from PDF files.

The solution provided here is biased toward running Iguana on Windows. It would not be too difficult to modify the platform-specific functions to work on Linux.

Let us know if you find this tip useful!

John Verne
Senior iNTERFACEWARE Developer