Introduction
Converting from RTF with the Translator is straightforward. The following example shows a function that takes an RTF document, and strips out all the formatting and non-text objects to produce plain text.
As you can see, it is as simple as loading the file and making a single function call.
Note: The main() function uses print() to log messages. When an Iguana channel is run, print() messages are logged as informational messages. For more sophisticated logging you can use the iguana.logInfo(), iguana.logWarning(), and iguana.logDebug() functions.
Source Code [top]
Here is the source code for the rtf module. To use it:
- Create a new shared module called “rtf” and copy and paste in the code below.
- Add the code require('rtf') at the top of the main module.
- Test using sample.rtf, or your own RTF file.
Code for main:
local rtf = require('rtf')

-- Entry point: reads sample.rtf, converts it to plain text with
-- rtf.toTxt and passes the result to trace().
-- Raises an error if sample.rtf cannot be opened.
function main()
   -- Use an explicit file handle instead of io.input(): this leaves the
   -- default input stream untouched and lets us close the file when done.
   local File = assert(io.open('sample.rtf', 'r'))
   local Rtf = File:read('*a')
   File:close()

   local text = rtf.toTxt(Rtf)
   trace(text)
end
Code for rtf module:
local rtf = {}

--
-- Implementation
--

-- Builds a set (value -> true lookup table) from a list of values.
local function Set(list)
   local set = {}
   for _, l in ipairs(list) do
      set[l] = true
   end
   return set
end

-- Control words/symbols that switch the current destination to IGNORE,
-- so the text that follows them in the same group is dropped.
local IgnoreSet = Set { 'info', 'fonttbl', 'colortbl', 'stylesheet', '*' }

-- States of the state machine
local PLAINTEXT = 1
local CONTROL = 2
local ARGUMENT = 3
local BACKSLASH = 4
local ESCAPED_CHAR = 5

-- Character destinations
local USE = 0
local IGNORE = 1

-- Parser state. All of it is local to the module (nothing leaks into the
-- global table) and everything except UnexpectedCharFound is reset by
-- rtf.toTxt.
local Out = {}            -- collected output characters, joined at the end
local StateStack = {}     -- saved destinations, one entry per open '{'
local State = PLAINTEXT   -- current state of the state machine
local Dest = USE          -- current character destination
local Token = ''          -- control word being collected
local Arg = nil           -- numeric argument of the control word, if any
local EscapedChar = ''    -- hex digits collected after \'

-- To print an exception only the first time
local UnexpectedCharFound = false

-- Sets current character destination (IGNORE or USE)
local function setDest(D)
   Dest = D
end

-- Saves the current destination when a group is opened.
local function pushState()
   StateStack[#StateStack + 1] = { Dest = Dest }
end

-- Restores the destination saved by the matching pushState.
-- An unbalanced '}' (empty stack) is ignored instead of raising an error.
local function popState()
   local Saved = table.remove(StateStack)
   if Saved then
      setDest(Saved.Dest)
   end
end

-- Collects or ignores the character based on the current destination.
-- A carriage return is stored as a line feed.
local function putChar(C)
   if C == '\r' then
      C = '\n'
   end
   if Dest ~= IGNORE then
      Out[#Out + 1] = C
   end
end

local function isAlpha(C)
   return string.match(C, "%a") ~= nil
end

local function isDigit(C)
   return string.match(C, "%d") ~= nil
end

local function isSpace(C)
   return string.match(C, "%s") ~= nil
end

-- Processes an RTF control word or control symbol.
-- T is the token, A is its argument (currently unused).
local function doControl(T, A)
   if T == 'par' then
      putChar('\n')
   elseif IgnoreSet[T] then
      setDest(IGNORE)
   end
end

-- Handles a character in plain-text context.
-- When SkipSpace is true, C is the character that terminated a control
-- word, so a whitespace character is treated as the delimiter and dropped.
local function handleText(C, SkipSpace)
   if C == '\\' then
      State = BACKSLASH
   elseif C == '{' then
      pushState()
   elseif C == '}' then
      popState()
   elseif not (SkipSpace and isSpace(C)) then
      putChar(C)
   end
end

-- Feeds one character into the state machine.
local function feedChar(C)
   if State == PLAINTEXT then
      handleText(C, false)

   elseif State == BACKSLASH then
      if C == '\\' or C == '{' or C == '}' then
         -- Escaped literal character
         putChar(C)
         State = PLAINTEXT
      elseif isAlpha(C) then
         State = CONTROL
         Token = C
         Arg = nil   -- do not carry over the previous control word's argument
      elseif C == '*' or C == '-' or C == '|' then
         -- Control symbols are a single character: handle them right away
         -- rather than letting the following letters be absorbed into a
         -- bogus control word.
         State = PLAINTEXT
         doControl(C)
      elseif C == "'" then
         State = ESCAPED_CHAR
         EscapedChar = ''
      elseif C == '~' then
         putChar(' ')
         State = PLAINTEXT
      else
         if not UnexpectedCharFound then
            print('Exception: unxepected '..C..' after \\')
            UnexpectedCharFound = true
         end
         -- Drop the unknown symbol and resume normal parsing so the
         -- following text is not misread as a control word.
         State = PLAINTEXT
      end

   elseif State == ESCAPED_CHAR then
      EscapedChar = EscapedChar..C
      if #EscapedChar == 2 then
         -- Only emit a character for two valid hex digits; anything else
         -- is dropped instead of crashing in string.char.
         if string.match(EscapedChar, '^%x%x$') then
            putChar(string.char(tonumber(EscapedChar, 16)))
         end
         State = PLAINTEXT
      end

   elseif State == CONTROL then
      if isAlpha(C) then
         Token = Token..C
      elseif isDigit(C) or C == '-' then
         State = ARGUMENT
         Arg = C
      else
         State = PLAINTEXT
         doControl(Token, Arg)
         handleText(C, true)
      end

   elseif State == ARGUMENT then
      if isDigit(C) then
         Arg = Arg..C
      else
         State = PLAINTEXT
         doControl(Token, Arg)
         handleText(C, true)
      end
   end
end

--
-- Public API
--

-- Given an RTF document as a Lua string (Data), return the text
-- portion of the document.
-- Carriage returns and line feeds in Data are removed before parsing;
-- line breaks in the result come from \par control words.
function rtf.toTxt(Data)
   Out = {}
   StateStack = {}
   State = PLAINTEXT
   Token = ''
   Arg = nil
   EscapedChar = ''
   setDest(USE)

   Data = string.gsub(Data, '[\r\n]', '')
   for i = 1, #Data do
      feedChar(string.sub(Data, i, i))
   end

   -- Join once at the end instead of concatenating per character.
   return table.concat(Out)
end

return rtf