
-- Add the module to the tree
local mod = klhtm
local me = {}
mod.regex = me

--[[
Regex.lua

The Regex module converts printing format strings into parsing patterns, in a locale independent way. e.g.

	"Your %s hits %s for %d."
		-> {"^Your (.+) hits (.+) for (%d+)%.", {1, 2, 3}}
	"Le %3$s de %2$s vous fait gagner %1$d points de vie."
		-> {"^Le (.+) de (.+) vous fait gagner (%d+) points de vie%.", {3, 2, 1}}

First a bit of background. We want to be able to read the combat log on all clients, whether the language is
english or french or chinese or otherwise. Furthermore, we don't want to rely on localisers working out the parser
strings manually, because there is a likelihood of human error, and it would take too long to get a new string added.

Fortunately, we have all the information we need (at runtime, at least). For instance, in the example above, the
value of the format string is given in the variable SPELLLOGSELFOTHER. If you open GlobalStrings.lua (you may need the
WoW interface extractor to see it), on english clients you will see
	...
	SPELLLOGSELFOTHER = "Your %s hits %s for %d."
	...
and on french clients you will see
	...
	SPELLLOGSELFOTHER = "Le %3$s de %2$s vous fait gagner %1$d points de vie."
	...

When the WoW client is printing to the combat log, it will run a command like
	ChatFrame2:AddMessage(string.format(SPELLLOGSELFOTHER, "Mortal Strike", "Mottled Boar", 352))

So, at runtime (that is, when the addon loads, but not when i am writing it - i only have the english values) the mod
has access to all the printing string format variables, like SPELLLOGSELFOTHER. We have a list of all the important
ones, for all the abilities that the mod needs, so we want to make a big parser to scan them all at runtime.

So the first thing we do when the addon loads is create all these parsers, then use them for all our combat log parsing.

------------------------------------------------------------

Structures:

1) Small Parser: <parser>
	local parser =
	{
		["formatstring"] = formatstring,	-- "You hit %s for %s."
		["regexstring"] = regexstring,		-- "^You hit (.+) for (.+)%."
		numarguments = me.numarguments,		-- 2
		ordering = me.ordering,				-- {1, 2}
		argtypes = me.types,				-- {"string", "string"}
	}
	Note that the values of <argtypes> match the canonical ordering (1, 2, 3, ...), not the localised
	ordering as in <ordering>.

2) Big Parser: <bigparser>
	local value =
	{
		["parser"] = parser,					-- a <parser> structure
		["globalstring"] = globalstringname,	-- "COMBATHITSELFOTHER"
		["identifier"] = identifier,			-- "whiteattackhit"
	}

3) Parser Set: <parserset>
	First level is a key-value list. The keys are event names, e.g. "CHAT_MSG_SPELL_SELF_BUFF".
	The values are ordered lists of <bigparser>s.

4) Parser Output: <output>
	local output =
	{
		hit = <flag>,	-- non-nil if the last parse succeeded
		temp = { },		-- the captures, with localised ordering
		final = { },	-- the captures, with canonical ordering
	}
	The idea is to reuse the structure, so the <hit> flag just records whether the last parse succeeded
	(non-nil for success). me.parsestring stores at most 5 captures, so parse strings are assumed to have
	no more arguments than that.

5) BigParser Output: same as <output>, but has the property <parser>, which is a <bigparser> structure.
]]

--[[
------------------------------------------------------------------------------
			Section A: Parsing a String With the Parser Engine
------------------------------------------------------------------------------
]]

-- The one shared BigParser Output structure. me.parse() fills it in and returns it on every call.
-- <hit> and <parser> start out nil; they are assigned by me.parsestring() and me.parse().
me.output = { temp = { }, final = { } }
me.output.hit = nil
me.output.parser = nil

--[[
mod.regex.parse(parserset, inputstring, event)
Given a string, checks whether it matches any parser that <parserset> holds for <event>.

<parserset>		a Parser Set: a table keyed by event name, whose values are ordered lists of Big Parsers.
<inputstring>	the text to parse, e.g. a line from your combat log.
<event>			the event the string was received on, e.g. "CHAT_MSG_SPELL_SELF_BUFF".

Returns: the shared <me.output> structure (the same table on every call, so copy anything you need to keep).
	<hit> is non-nil when a parser matched; in that case <parser> is the Big Parser that matched and
	<final> holds the captures in canonical order. When nothing matched, <hit> is nil and the other
	fields are left over from earlier calls.
]]
me.parse = function(parserset, inputstring, event)

	local output = me.output

	-- 0) Reset output
	output.hit = nil

	-- 1) Check that the event is handled by the parser set
	local parsersubset = parserset[event]

	if parsersubset == nil then
		return output
	end

	-- 2) Look for a parser. The list was built in priority order by me.addparsestring, so it has to be
	-- walked in index order; ipairs guarantees that.
	local parser, y

	for _, bigparser in ipairs(parsersubset) do

		parser = bigparser.parser

		if me.parsestring(parser, inputstring, output) then

			output.parser = bigparser

			-- verify numeric arguments
			for y = 1, parser.numarguments do
				if (parser.argtypes[y] == "number") and (tonumber(output.final[y]) == nil) then

					-- error occur! Report it, but still hand the hit back to the caller.
					if mod.out.checktrace("error", me, "regex") then
						-- tostring() because the offending capture may be nil, which "%s" rejects
						mod.out.printtrace(string.format("The value |cffffff00%s|r of argument %d is not a number as it should be! Parser = %s, format string = %s. Event = %s, string = %s.", tostring(output.final[y]), y, bigparser.identifier, parser.formatstring, event, inputstring))
					end

					break
				end
			end

			return output
		end
	end

	-- 3) No hit - oh well!
	return output
end

--[[
me.parsestring(parser, inputstring, output)
Parses a string with the specified parser.

<parser>		a Small Parser structure, i.e. an output of me.formattoregex().
<inputstring>	the string to parse, e.g. a combat log line.
<output>		a structure to store the results in. It must have .temp and .final properties which are lists.

Returns: true if the string satisfies the parser, nothing (nil) otherwise.
	<output.hit> is set to the end index of the match, or nil on failure.
	<output.temp> receives up to 5 captures in the order they appear in the string.
	<output.final> receives the same captures, moved to their canonical argument positions. Entries of
	<output.final> beyond <parser.numarguments> are not cleared.
]]
me.parsestring = function(parser, inputstring, output)

	-- string.find gives (start, end, capture1, capture2, ...), or a single nil when there is no match.
	-- The start index is not needed; it goes into a local so that no global is written.
	local _, matchend, capture1, capture2, capture3, capture4, capture5 = string.find(inputstring, parser.regexstring)
	local temp = output.temp

	output.hit = matchend
	temp[1], temp[2], temp[3], temp[4], temp[5] = capture1, capture2, capture3, capture4, capture5

	-- early exit on fail
	if matchend == nil then
		return
	end

	-- now reorder arguments: capture number x belongs to argument number ordering[x]
	for x = 1, parser.numarguments do
		output.final[parser.ordering[x]] = temp[x]
	end

	return true
end

--[[
------------------------------------------------------------------------------
			Section B: Creating the Parser Engine at Startup
------------------------------------------------------------------------------
]]

--[[
me.addparsestring(parserset, identifier, globalstringname, event)
Adds a new parser to the parser set.

<parserset>			a key-value list, keyed by event names; values are lists of parsers listening to that event.
<identifier>		a description of the capture, e.g. "spellcrit".
<globalstringname>	the name of the variable that holds the format pattern, e.g. "SPELLLOGHIT".
<event>				the event in which the capture comes, e.g. "CHAT_MSG_SPELL_SELF_BUFF".

Nothing is returned. If the global string does not exist, or the generated parser fails me.testparser(),
a trace is printed (when enabled) and no parser is added; the (possibly empty) list for <event> is still created.
]]
me.addparsestring = function(parserset, identifier, globalstringname, event)

	-- if there are no parsers on this event already, create a new list
	local eventparsers = parserset[event]

	if eventparsers == nil then
		eventparsers = { }
		parserset[event] = eventparsers
	end

	-- get the value of the global string variable
	local formatstring = getglobal(globalstringname)

	if formatstring == nil then
		if mod.out.checktrace("error", me, "regex") then
			mod.out.printtrace(string.format("No global string %s found. ID = %s, event = %s.", globalstringname, identifier, event))
		end
		return
	end

	-- convert to regex, and make sure the result can read back what the format string prints
	local parser = me.formattoregex(formatstring)

	if me.testparser(parser) == nil then
		if mod.out.checktrace("error", me, "regex") then
			mod.out.printtrace(string.format("parser failed on %s.", identifier))
		end
		return
	end

	-- This is the Big Parser structure.
	local value =
	{
		["parser"] = parser,
		["globalstring"] = globalstringname,
		["identifier"] = identifier,
	}

	-- ordered insert. If there are several parsers sharing the one event, we want to order them in such a way
	-- that no parser gets blocked by another, less specific parser. Go in front of the first existing
	-- parser that me.compareregexstrings says must come after ours.
	for x = 1, table.getn(eventparsers) do
		if me.compareregexstrings(eventparsers[x].parser, parser) == 1 then
			table.insert(eventparsers, x, value)
			return
		end
	end

	-- the list is empty, or nobody has to come after us: go last
	table.insert(eventparsers, value)
end

--[[
me.formattoregex(formatstring)
Returns a Small Parser structure built from a print formatting string.

<formatstring>	e.g. "You hit %s for %s.".

Returns: a table with the fields
	formatstring	the argument, unchanged
	regexstring		the matching pattern, anchored at the start, e.g. "^You hit (.+) for (.+)%."
	numarguments	the number of %s / %d blocks found
	ordering		list; ordering[x] is the argument index of the x-th block in the string
	argtypes		list; argtypes[i] is "string" or "number" for argument index i

Only the blocks %s, %d, %N$s and %N$d (N a single digit) are recognised.
The work-in-progress values live in me.numarguments, me.ordering and me.types, which me.gsubreplacement updates.
]]
me.formattoregex = function(formatstring)

	-- 1) Put a "%" in front of every character of the literal text that is magic in a pattern:
	--    ^ ( ) . [ ] * + - ?
	-- so that it only matches itself. "%" is left alone because it introduces the formatting blocks
	-- (and "%%" means a literal percent sign in both notations). "$" is left for step 3, since the
	-- positional blocks like "%3$s" still need it.
	local regexstring = string.gsub(formatstring, "([%^%(%)%.%[%]%*%+%-%?])", "%%%1")

	-- 2) Formatting blocks have two types. If the arguments are in the same order as the english, the
	-- patterns will look like "%s %s %d %s" etc. If they have a different argument ordering, it would be
	-- e.g. "%3$s %1$d %2$s". The search pattern below covers both; each block found is handed to
	-- me.gsubreplacement, which records it and returns the capture to substitute.
	me.numarguments = 0
	me.ordering = { }
	me.types = { }

	regexstring = string.gsub(regexstring, "(%%(%d?)$?([sd]))", me.gsubreplacement)

	-- 3) Any "$" still present is literal text. It would act as an anchor at the end of a pattern, so escape it.
	regexstring = string.gsub(regexstring, "%$", "%%$")

	-- 4) Adding a ^ character to the search string means that the string.find() is only allowed to match
	-- the test string starting at the first character. The end is deliberately not anchored.
	regexstring = "^" .. regexstring

	return
	{
		["formatstring"] = formatstring,
		["regexstring"] = regexstring,
		numarguments = me.numarguments,
		ordering = me.ordering,
		argtypes = me.types,
	}
end

-- set in me.formattoregex:
-- me.numarguments = 0
-- me.ordering = { }
-- me.types = { }

--[[
me.gsubreplacement(totalstring, index, formattype)
The replacement function that me.formattoregex gives to string.gsub.

The round brackets in the search pattern "(%%(%d?)$?([sd]))" denote captures. They are sent to this function
as arguments, in the order of their open brackets:

<totalstring>	the entire block, e.g. "%3$s" or "%s".
<index>			the argument index as a string, e.g. "3", or "" when the block does not supply one.
<formattype>	"s" or "d", i.e. whether the print format is a string or an integer.

Returns: the capture to put in the pattern, "(%d+)" for an integer and "(.+)" for a string.
Side effects: increments me.numarguments, appends the argument index to me.ordering, and records the type
in me.types under that index.
]]
me.gsubreplacement = function(totalstring, index, formattype)

	me.numarguments = me.numarguments + 1

	-- blocks that don't supply an index are numbered by position (the ordering is 1, 2, 3, ...)
	index = tonumber(index) or me.numarguments

	table.insert(me.ordering, index)

	-- the return value is the actual replacement
	if formattype == "d" then
		me.types[index] = "number"
		return "(%d+)"
	end

	me.types[index] = "string"
	return "(.+)"
end

--[[
me.compareregexstrings(parser1, parser2)
We are given two parsers, and we want to know in which order their patterns should be checked. e.g.

(1) "You gain (%d+) health from (.+)%."  vs
(2) "You gain (%d+) (.+) from (.+)%."

In this case we should check for (1) first, then (2).

<parser1>, <parser2>	Small Parser structures; only their <regexstring> fields are read.

The two patterns are walked token by token (see me.getnexttoken) until they differ. Then
1) If one pattern runs out first, the other one is checked first.
2) If one pattern goes to a capture and the other goes to text, do the text first.
3) If both go to captures, they are of different types, so don't worry: parser1 first.
4) If both go to different text, take the rest of pattern 1 and search for it, as a pattern, in the rest
	of pattern 2. If it is found, parser2 goes first; otherwise parser1.

An older tie-break for case 4 (most captures first, then the longest pattern) sat behind an unconditional
return and never ran; it has been removed.

Return values:
	-1: parser1 first
	+1: parser2 first
Where possible, prefer to return -1.
]]
me.compareregexstrings = function(parser1, parser2)

	local regex1, regex2 = parser1.regexstring, parser2.regexstring
	local start1, start2 = 1, 1
	local token1, token2

	-- skip over the part that the two patterns have in common
	while true do
		token1 = me.getnexttoken(regex1, start1)
		token2 = me.getnexttoken(regex2, start2)

		-- check for end of strings
		if token2 == nil then
			return -1
		elseif token1 == nil then
			return 1
		end

		if token1 ~= token2 then
			break
		end

		-- equal so far
		start1 = start1 + string.len(token1)
		start2 = start2 + string.len(token2)
	end

	-- To get here, they have arrived at different tokens. Literal tokens are 1 or 2 characters long
	-- ("a" or "%."), captures are longer ("(.+)" or "(%d+)").
	local atcapture1 = string.len(token1) > 2
	local atcapture2 = string.len(token2) > 2

	if atcapture1 then
		if atcapture2 then
			-- they are different, so one is a number, one a string, so who cares
			return -1
		end

		-- prefer the non-capture first
		return 1
	end

	if atcapture2 then
		return -1
	end

	-- neither at a capture
	if string.find(string.sub(regex2, start2), string.sub(regex1, start1)) then
		return 1
	end

	return -1
end

--[[
me.getnexttoken(regex, start)
Returns the next regex token in a string, or nil if <start> is past the end of the string.

<regex>	the regex string, e.g. "hello (.+)%." .
<start>	the 1-based index of the string to start from.

Tokens are captures, e.g. "(.+)" or "(%d+)", or escaped characters, e.g. "%." or "%(", or normal letters, e.g. "a", ",".
An open bracket is always taken to start one of those two captures; nothing checks the characters after it.
]]
me.getnexttoken = function(regex, start)

	if start > string.len(regex) then
		return nil
	end

	local first = string.sub(regex, start, start)

	-- escaped character: the "%" and whatever follows it
	if first == "%" then
		return string.sub(regex, start, start + 1)
	end

	-- normal letter
	if first ~= "(" then
		return first
	end

	-- capture: "(%d+)" is five characters long, "(.+)" is four
	local lastoffset = 3

	if string.sub(regex, start + 1, start + 1) == "%" then
		lastoffset = 4
	end

	return string.sub(regex, start, start + lastoffset)
end

--[[
------------------------------------------------------------------------------
					Section C: Testing the Regex System
------------------------------------------------------------------------------
]]

--[[
mod.regex.test()
Checks that the parsers created from print format strings are working correctly, over a range of tough strings.
Will print out the results: either the first string that failed, or a message that all of them passed.
]]
me.test = function()

	-- kept local, so that running the test does not create a global called "strings"
	local strings =
	{
		"%3$s vous fait gagner %1$d %2$s.",
		"Votre %4$s inflige %2$d points de degats de %3$s a %1$s.",
		"Vous utilisez %s sur votre %s.",
	}

	local count = table.getn(strings)

	for x = 1, count do
		if me.testformatstring(strings[x]) == nil then
			mod.out.print(string.format("test failed on string %d, '%s'.", x, strings[x]))
			return
		end
	end

	mod.out.print(string.format("all %d strings passed their tests.", count))
end

--[[
me.testformatstring(value)
Given a print formatting string, creates a parser for that string, prints a summary of the parser,
and checks that the parser works.

<value>	e.g. "You hit %s for %s."

Returns: non-nil if the test succeeds (the result of me.testparser).
]]
me.testformatstring = function(value)

	local parser = me.formattoregex(value)

	-- debug a bit
	local summary = string.format("Format string = |cffffff00%s|r, regex string = |cffffff00%s|r, numargs = |cffffff00%d|r.", parser.formatstring, parser.regexstring, parser.numarguments)
	mod.out.print(summary)

	return me.testparser(parser)
end

--[[
me.testparser(parser, debug)
Checks experimentally that a parser can read a string printed from its own format string.

<parser>	a Small Parser structure.
<debug>		a flag; if non-nil some debugging will be printed.

Returns: true if the test succeeds, nil otherwise.

The method makes up a random value for every argument (a random string, or a random integer from 1 to 1000),
prints them through <parser.formatstring>, then parses the result with me.parsestring. The test succeeds
when the string parses. The captured values are only printed (in debug mode); they are not compared with
the original arguments.
]]
me.testparser = function(parser, debug)

	local argcount = parser.numarguments

	-- 1) Make up the arguments, visiting them in the order they appear in the format string
	local arguments = { }

	for position = 1, argcount do
		local slot = parser.ordering[position]

		if parser.argtypes[slot] == "string" then
			arguments[slot] = me.generaterandomstring()
		else
			arguments[slot] = math.random(1000)
		end
	end

	-- debug print. A gap in the argument list cannot be formatted, so give up on one.
	if debug then
		for slot = 1, argcount do
			if arguments[slot] == nil then
				mod.out.print("arg " .. slot .. " is nil!")
				return
			end

			mod.out.print("arg" .. slot .. " = " .. arguments[slot])
		end
	end

	-- 2) Print them the way the game client would
	local randomstring = string.format(parser.formatstring, unpack(arguments))

	if debug then
		mod.out.print("the test string = " .. randomstring)
	end

	-- 3) Try to parse it, into a private output structure
	local output = { temp = { }, final = { } }

	if me.parsestring(parser, randomstring, output) == nil then
		mod.out.print("The string did not parse.")
		return nil
	end

	if debug then
		for slot = 1, argcount do
			mod.out.print("output" .. slot .. " = " .. output.final[slot])
		end
	end

	return true
end

--[[
me.generaterandomstring()
Generates a random string of capital letters and spaces, 11 to 20 characters long.
Each character is a space one time in three, otherwise a letter. Will look something like "AJ WFDSO ECL SFOE".
]]
me.generaterandomstring = function()

	local length = 10 + math.random(10)
	local pieces = { }

	for position = 1, length do
		if math.random(3) == 3 then
			pieces[position] = " "
		else
			-- character codes 65 to 90 are "A" to "Z"
			pieces[position] = string.char(64 + math.random(26))
		end
	end

	return table.concat(pieces)
end