rtf.lua
Verified Featured
Added by iNTERFACEWARE
A module for converting an RTF file to plain text.
Source Code
-- RTF module - converts an RTF document into plain text.
-- http://help.interfaceware.com/code/details/rtf-lua
-- Module table; the public API is attached to it and it is returned at the
-- end of the file.
local rtf = {}
-- Build a set from an array-like table.
-- list is a sequence of values
-- Returns a table in which every value of list is a key mapped to true,
-- so membership can be tested with set[value].
local function Set (list)
   local members = {}
   for _, value in ipairs (list) do
      members[value] = true
   end
   return members
end
-- Control words that switch the current destination to IGNORE
-- (see doControl).
local IgnoreSet = Set ({ 'info', 'fonttbl', 'colortbl', 'stylesheet', '*' })

-- Working state of the converter; rtf.toTxt resets the first three on
-- every call.
local Out = ''           -- plain text collected so far
local StateStack = {}    -- destinations saved on '{', restored on '}'
local State              -- current state of the state machine

-- States of the state machine
local PLAINTEXT, CONTROL, ARGUMENT, BACKSLASH, ESCAPED_CHAR = 1, 2, 3, 4, 5

-- Character destinations
local USE, IGNORE = 0, 1

-- To print an exception only the first time
local UnexpectedCharFound = false
-- Current character destination (IGNORE or USE). Declared here so that it is
-- a file-local upvalue shared by the functions below instead of a global
-- variable leaking out of the module.
local Dest

-- Sets current character destination (IGNORE or USE)
-- D is the new destination
local function setDest (D)
   Dest = D
end
-- Save the current destination on top of StateStack (called on '{').
local function pushState ()
   local Saved = { Dest = Dest }
   StateStack[#StateStack + 1] = Saved
end
-- Restore the destination saved by the matching pushState (called on '}').
-- When the stack is empty - the input has more '}' than '{' - the call is a
-- no-op, so malformed input no longer raises "attempt to index a nil value".
local function popState ()
   local EL = table.remove (StateStack)
   if EL ~= nil then
      setDest (EL['Dest'])
   end
end
-- Append a character to the output unless the current destination is IGNORE.
-- C is the character; a carriage return is stored as a line feed
-- B is accepted for symmetry with feedChar but is not used here
local function putChar (C, B)
   if Dest == IGNORE then
      return
   end
   local Ch = C
   if Ch == '\r' then
      Ch = '\n'
   end
   Out = Out .. Ch
end
-- True when C contains an alphabetic character (Lua class %a).
local function isAlpha (C)
   if string.find (C, "%a") then
      return true
   end
   return false
end
-- True when C contains a decimal digit (Lua class %d).
local function isDigit (C)
   if string.find (C, "%d") then
      return true
   end
   return false
end
-- True when C contains a whitespace character (Lua class %s).
local function isSpace (C)
   if string.find (C, "%s") then
      return true
   end
   return false
end
-- Act on a completed RTF control word.
-- T is the token (control word without the backslash)
-- A is its argument; it is accepted but not used at present
-- 'par' emits a line break; any token listed in IgnoreSet switches the
-- current destination to IGNORE. All other tokens have no effect.
local function doControl (T, A)
   if T == 'par' then
      putChar ('\n')
      return
   end
   if IgnoreSet[T] then
      setDest (IGNORE)
   end
end
-- Scratch registers of the state machine. They are file-local so they no
-- longer leak into the global environment.
local Token        -- control word being accumulated
local Arg          -- argument text of the current control word, or nil
local EscapedChar  -- hex digits collected after \'

-- Feed one character of RTF input to the state machine.
-- C is the character as a one-character string
-- B is its byte value (handed on to putChar)
local function feedChar (C, B)
   -- Handle a character in plain-text context.
   -- With CheckSpace set, a whitespace character is dropped: it is the
   -- delimiter that terminated the preceding control word.
   local function nextState (C, B, CheckSpace)
      if C == '\\' then
         State = BACKSLASH
      elseif C == '{' then
         pushState ()
      elseif C == '}' then
         popState ()
      elseif not CheckSpace or not isSpace (C) then
         putChar (C, B)
      end
   end

   if State == PLAINTEXT then
      nextState (C, B, false)
   elseif State == BACKSLASH then
      if C == '\\' or C == '{' or C == '}' then
         -- Escaped literal character
         putChar (C)
         State = PLAINTEXT
      elseif isAlpha (C) then
         State = CONTROL
         Token = C
         -- Forget the argument of the previous control word
         Arg = nil
      elseif C == '*' or C == '-' or C == '|' then
         -- Control symbols are a single character long; handling them here
         -- keeps the letters that follow from being swallowed as part of a
         -- control word. '*' is in IgnoreSet, so its group is still ignored.
         doControl (C)
         State = PLAINTEXT
      elseif C == "'" then
         State = ESCAPED_CHAR
         EscapedChar = ''
      elseif C == '~' then
         putChar (' ')
         -- Was 'state = PLAINTEXT': that wrote a stray global and left the
         -- machine in BACKSLASH, so the next character was misparsed.
         State = PLAINTEXT
      else
         if (UnexpectedCharFound ~= true) then
            print ('Exception: unxepected '..C..' after \\')
            UnexpectedCharFound = true
         end
         -- Drop the unknown symbol and resume normal parsing instead of
         -- treating the following characters as if they came after '\'.
         State = PLAINTEXT
      end
   elseif State == ESCAPED_CHAR then
      EscapedChar = EscapedChar..C
      if #EscapedChar == 2 then
         local Code = tonumber (EscapedChar, 16)
         -- Invalid hex digits yield nil; skip them rather than letting
         -- string.char raise an error.
         if Code and Code >= 0 and Code <= 255 then
            putChar (string.char (Code))
         end
         State = PLAINTEXT
      end
   elseif State == CONTROL then
      if isAlpha (C) then
         Token = Token..C
      elseif isDigit (C) or C == '-' then
         State = ARGUMENT
         Arg = C
      else
         doControl (Token, Arg)
         State = PLAINTEXT
         nextState (C, B, true)
      end
   elseif State == ARGUMENT then
      if isDigit (C) then
         Arg = Arg .. C
      else
         State = PLAINTEXT
         doControl (Token, Arg)
         nextState (C, B, true)
      end
   end
end
--
-- Public API
--
-- Given an RTF document as a Lua string (Data), return the text
-- portion of the document.
-- The converter state (Out, StateStack, State and the destination) is reset
-- on every call, so the function can be called repeatedly.
function rtf.toTxt(Data)
   Out = ''
   StateStack = { }
   State = PLAINTEXT
   setDest (USE)
   -- Remove raw CR/LF from the input in a single pass before parsing
   Data = string.gsub (Data, '[\r\n]', '')
   for i = 1, #Data do
      local B = string.byte (Data, i)
      feedChar (string.char (B), B)
   end
   return Out
end
-- Help data for rtf.toTxt, registered with help.set below.
-- The long strings are kept flush left: any indentation inside [[ ]] would
-- become part of the help text.
local rtf_toTxt = {
   Title = "rtf.toTxt",
   Usage = "rtf.toTxt(rtf)",
   SummaryLine = "Converts an RTF to plain text.",
   Desc = [[Converts an RTF to plain text removing all formatting in the process.
<p>The rtf.toTxt() function takes an RTF document, and strips out all the
formatting and non-text objects to produce plain text.
]],
   Returns = {
      { Desc = "RTF as plain with all formatting removed <u>string</u>." },
   },
   ParameterTable = true,
   Parameters = {
      { rtf = { Desc = 'A string containing RTF data <u>string</u>.' } },
   },
   Examples = {
      [[ -- Read an rtf file into the Content string variable
local FileName = iguana.project.root() ..'/'..iguana.project.guid()..'/sample.rtf'
local F = io.open(FileName,'r')
local Content = F:read('*a')
F:close()
-- Now convert the content into plain text.
local Text = rtf.toTxt(Content)
trace(Text)
-- Of course formatting is lost...
]],
   },
   SeeAlso = {
      {
         Title = "rtf.lua - in our code repository",
         Link = "http://help.interfaceware.com/code/details/rtf-lua",
      },
      {
         Title = "RTF conversion example",
         Link = "http://help.interfaceware.com/v6/rtf-conversion-example",
      },
   },
}
-- Attach the help data above to rtf.toTxt, then export the module table.
help.set ({
   input_function = rtf.toTxt,
   help_data = rtf_toTxt,
})

return rtf
Description
A module for converting an RTF file to plain text.
Attachments
Usage Details
The code contains an rtf.toTxt() function that takes an RTF document, and strips out all the formatting and non-text objects to produce plain text.
How to use rtf.lua:
- Create a new shared module called “rtf” and paste in the code above
- Add local rtf = require 'rtf' at the top of the main module
- Test using the attached sample.rtf, or your own rtf file
Example code for main:
local rtf = require 'rtf'
-- Simple module to convert RTF file into text.
function main()
   -- Open the file explicitly instead of redirecting the process-wide
   -- default input with io.input, and close it again once it has been read.
   -- assert raises with the reason reported by io.open if the file cannot
   -- be opened.
   local F = assert(io.open('sample.rtf', 'r'))
   -- Rtf is now a local; the original assignment created a global variable.
   local Rtf = F:read('*a')
   F:close()
   -- Now convert the content into plain text.
   local text = rtf.toTxt(Rtf)
   trace(text)
   -- Of course formatting is lost...
end
More Information