We often encounter interesting problem-solving opportunities when participating in LinkedIn Iguana group discussions. Recently, we were asked if it was possible to retrieve data from a PDF file. During the discussions that followed, someone raised the idea of using a third-party utility to convert PDF to XML. I decided to come up with a way to invoke this method from within the Translator itself. The following steps summarize my solution:
- Create a channel for the incoming message and its PDF attachment.
- Write the PDF to a scratch directory.
- Invoke the third party conversion program to convert the PDF file into XML data.
- Use Iguana’s
xml
functions to parse and manipulate the XML data as desired.
Here is the specific procedure:
- Download and install an appropriate third-party conversion utility. For the code example below, we chose PDF to XML. (It may require LibXml2 from GNU/XMLSoft.)
- Create and configure a new channel with an LLP Listener source component and a To Translator destination component.
- Open the script and commit the first milestone.
- Copy and paste the following code snippet into your script, or
require
it as an external module. Note: The purpose of each section in this script is identified with notes right in the code.
-- pdf module: converts a PDF file into an XML node tree by shelling out to
-- the third-party PDFtoXML command-line utility.
--
-- Configuration (set by the caller before calling pdf.convert):
--   pdf.toolPath  (required) full path to the conversion executable.
--   pdf.toolName  (optional) display name used in error messages; defaults
--                 to the executable name taken from pdf.toolPath.
local pdf = {}

-- Online help
local pdfhelp = {
   Title="pdf.convert";
   Usage="local Xml, Images = pdf.convert{file='Doctor Report.pdf',}",
   Desc=[[Under the hood this function invokes the PDFtoXML utility. Please note: if the "images" option is set, it is the calling function's responsibility to clean up the generated image files.]];
   ["Returns"] = {
      {Desc="A XML node tree with the converted contents of the PDF document"},
      {Desc=[[Optionally a list of the images found in the document. This can be suppressed using the '-noImage' flag.]]}
   };
   ParameterTable= true,
   Parameters= {
      {file= {Desc='The source PDF file to convert.'}},
      {images= {Desc='This is an optional flag you can set to true to also extract images.'; Opt=true}},
   };
   Examples={
      "local Xml = pdf.convert{file='Doctor Report.pdf'}",
   };
   SeeAlso={
      {
         Title="Tips and tricks from John Verne",
         Link="http://wiki.interfaceware.com/1338.html"
      }
   }
}

--
-- Helper functions
--

-- Platform directory separator (first character of package.config).
local DIRSEP = string.sub(package.config, 1, 1)
-- The original module published this value as a global; the assignment is
-- kept so any script that reads LUA_DIRSEP keeps working.
LUA_DIRSEP = DIRSEP

-- Validate the parameter table given to pdf.convert.
-- Returns true when the options look usable, otherwise false plus a
-- message describing the first problem found.
local function validateOpts(T)
   -- Guard against pdf.convert() / pdf.convert('x.pdf'): indexing a
   -- non-table would raise an unhelpful error (or silently index a string).
   if type(T) ~= 'table' then
      return false, 'Missing required parameter "file"'
   end

   local file = T.file
   local images = T.images

   if file == nil then
      return false, 'Missing required parameter "file"'
   end
   if file == '' or type(file) ~= 'string' then
      return false, 'Parameter "file" must be of type string and not empty.'
   end
   if images ~= nil and type(images) ~= 'boolean' then
      return false, 'Parameter "images" must be of type boolean'
   end

   return true
end

-- Invoke the third-party tool with the right options to create the XML
-- data for the given PDF file.
--
-- pdffile: Windows path (with drive letter) of the PDF to convert.
-- cmdOpts: extra command-line options passed to the tool verbatim.
--
-- Returns xmlfile, imagedir, output on success, where output is whatever
-- the command printed. Returns nil, nil, message when the source path is
-- not usable or the command could not be started.
local function generateXMLoutput(pdffile, cmdOpts)
   -- The path is embedded in a double-quoted shell argument below, so a
   -- quote character would let the rest of the string escape the quoting.
   if pdffile:find('"', 1, true) then
      return nil, nil, 'source file "' .. pdffile .. '" is not valid.'
   end

   -- Tricky pattern to get all the parts of a Windows pathname:
   -- directory (incl. drive), drive, file name, base name (with the
   -- trailing dot) and extension.
   local srcDir, vol, _, basename, ext =
      pdffile:match("(([%a]:).-)(([^\\/]-%.?)([^%.\\/]*))$")
   if srcDir == nil or vol == nil or basename == nil then
      return nil, nil, 'source file "' .. pdffile .. '" is not valid.'
   end

   -- A file name without any extension is captured entirely as the
   -- "extension"; use it as the base name so the output is not just "xml".
   if basename == '' then
      basename = ext .. '.'
   end

   -- Default locations.
   local xmlfile = srcDir .. basename .. 'xml'
   local imagedir = xmlfile .. '_data'

   -- Split the tool path into its directory and executable name.
   local toolDir, exe = pdf.toolPath:match('(.-)([^\\/]-%.?([^%.\\/]*))$')
   local toolName = tostring(pdf.toolName or exe)

   -- Run the command and get the output. "cd /d" also switches the drive,
   -- which matters when the tool is installed on a different volume than
   -- the PDF file.
   local command = vol .. ' & cd /d "' .. toolDir .. '" & ' .. exe .. ' '
      .. cmdOpts .. ' "' .. pdffile .. '" "' .. xmlfile .. '"' .. ' 2>&1'
   local proc, openErr = io.popen(command)
   if not proc then
      return nil, nil,
         'invocation of ' .. toolName .. ' failed: ' .. tostring(openErr)
   end

   local data = proc:read("*a")
   proc:close()

   -- Handle exception cases. Sometimes the data we get is data we want,
   -- and other times it is command error output. So, normalize the
   -- returns here.
   if data == nil then
      data = 'invocation of ' .. toolName .. ' failed with no message.'
      xmlfile = nil
   end

   return xmlfile, imagedir, data
end

-- For a given XML file on disk, read the contents and return them.
-- Returns nil plus the io.open error message when the file cannot be
-- opened, so the caller can report it together with the tool output.
local function getXMLdata(xmlfile)
   local f, err = io.open(xmlfile, 'rb')
   if not f then
      return nil, err
   end
   local xmldata = f:read('*a')
   f:close()
   return xmldata
end

-- Return true if the flag passed in is set.
-- nil, false and the empty string are all considered "not set".
local function isSet(flag)
   return flag ~= nil and flag ~= false and flag ~= ''
end

-- Get the list of JPEG images in the data directory.
local function getImages(imageDir)
   local images = {}
   for FileName in os.fs.glob(imageDir .. DIRSEP .. '*.jpg') do
      images[#images + 1] = FileName
   end
   return images
end

-- Main conversion entry point.
--
-- T.file   (string, required)  path of the PDF file to convert.
-- T.images (boolean, optional) true to also extract the images.
--
-- Returns the parsed XML document and, when T.images is set, a list of
-- the extracted JPEG files (nil otherwise). Raises an error on invalid
-- parameters, a missing pdf.toolPath, or a failed conversion.
function pdf.convert(T)
   -- Validate parameters
   local valid, err = validateOpts(T)
   if not valid then
      error(err, 2)
   end

   -- Is the path to the conversion tool something reasonable?
   local toolPath = pdf.toolPath or ''
   if type(toolPath) ~= 'string' or toolPath == '' then
      error('Path to conversion executable is bad or missing', 1)
   end

   -- Set/normalize options
   local opts = ''
   if not isSet(T.images) then
      opts = opts .. " -noImage "
   end

   -- Get the PDF data and metadata written out to an XML file.
   local xmlfile, imagedir, output = generateXMLoutput(T.file, opts)
   if not xmlfile then
      error('Could not generate XML file: ' .. output, 2)
   end

   -- Read the XML datafile. If it is missing the conversion failed, and
   -- the tool output is the only useful diagnostic, so include it.
   local xmlData, readErr = getXMLdata(xmlfile)
   if not xmlData then
      error('Could not generate XML file: ' .. tostring(readErr)
         .. ' (tool output: ' .. output .. ')', 2)
   end

   -- Remove the temporary file before parsing so it is not left behind
   -- when the parser raises. This is not critical, so ignore the return.
   os.remove(xmlfile)

   local document = xml.parse{data=xmlData}

   -- Load up the table of images created, if appropriate.
   local images = nil
   if isSet(T.images) then
      images = getImages(imagedir)
   end

   return document, images
end

-- Hook up the help to the functions
help.set{input_function=pdf.convert, help_data=pdfhelp}

return pdf
Implementation Notes
Note: PDF to XML is not under active development, and there is no source code for the project currently available. However, there are binaries available for Windows and Linux. Currently, there is no good free cross-platform solution for creating XML or XHTML data from PDF files.
The solution provided here is biased toward running Iguana on Windows. It would not be too difficult to modify the platform-specific functions to work on Linux.
Let us know if you find this tip useful!