(* -*- mode: sml -*- * $Id: regexmatch.smlnj,v 1.1 2001/07/10 13:04:21 doug Exp $ * http://www.bagley.org/~doug/shootout/ * from Stephen Weeks * "ported" to SML/NJ * with help from Daniel Wang *) structure Test : sig val main : (string * string list) -> OS.Process.status end = struct fun ++ (r: int ref): int = let val n = 1 + !r val _ = r := n in n end structure Int = struct open Int type t = int fun exists (start: t, stop: t, f: t -> bool): bool = let fun loop i = i < stop andalso (f i orelse loop (i + 1)) in loop start end fun forall (start, stop, f) = not (exists (start, stop, not o f)) fun fold (start: t, stop: t, a: 'a, f: t * 'a -> 'a): 'a = let fun loop (i: t, a: 'a) = if i >= stop then a else loop (i + 1, f (i, a)) in loop (start, a) end fun for (start: t, stop: t, f: t -> unit): unit = let fun loop i = if i >= stop then () else (f i; loop (i + 1)) in loop start end end structure Array2 = struct open Array2 type 'a t = 'a array val new = array fun tabulate (r, c, f) = Array2.tabulate RowMajor (r, c, f) fun foreachi (a, f) = appi RowMajor f {base = a, row = 0, col = 0, nrows = NONE, ncols = NONE} end structure Vector = struct open Vector fun exists (v, f) = Int.exists (0, length v, fn i => f (sub (v, i))) fun foreach (v, f) = app f v fun foreachi (v, f) = appi f (v, 0, NONE) fun new (n, x) = tabulate (n, fn _ => x) fun new1 x = new (1, x) end structure List = struct open List fun foreach (l, f) = app f l fun fold (l, b, f) = foldl f b l fun appendRev (l1, l2) = fold (l1, l2, op ::) fun push (r, x) = r := x :: !r fun keepAll (l, f) = filter f l fun peek (l, f) = find f l fun insert (l, x, op <=) = let fun loop (l, ac) = case l of [] => appendRev (ac, [x]) | x' :: l' => if x <= x' then appendRev (ac, x :: l) else loop (l', x' :: ac) in loop (l, []) end end structure Array = struct open Array val new = array fun modify (a, f) = Array.modify f a fun foreachi (a, f) = appi f (a, 0, NONE) fun indices (a: bool array): int vector = let val n = Array.length a val numTrue = let fun loop (i, count) = if i = n then count else loop (i + 1, if Array.sub (a, i) then count + 1 else count) in loop (0, 0) end val next = ref 0 fun loop i = if Array.sub (a, i) then (next := i + 1; i) else loop (i + 1) in Vector.tabulate (numTrue, fn _ => loop (!next)) end end structure Char = struct open Char val fromInt = chr val toInt = ord end structure String = struct open String type t = string fun contains (s: t, c: char): bool = Int.exists (0, size s, fn i => c = sub (s, i)) end val numChars: int = 128 structure Regexp = struct datatype t = AnchorStart | CharSet of char -> bool | Or of t list | Seq of t list | Star of t end structure Stack: sig type 'a t val clear: 'a t -> unit val exists: 'a t * ('a -> bool) -> bool val foreach: 'a t * ('a -> unit) -> unit val new: int * 'a -> 'a t val push: 'a t * 'a -> unit end = struct datatype 'a t = T of {elts: 'a array, size: int ref} fun new (size: int, dummy: 'a): 'a t = T {elts = Array.new (size, dummy), size = ref 0} fun push (T {elts, size}, x) = let val n = !size val _ = Array.update (elts, n, x) val _ = size := n + 1 in () end fun exists (T {elts, size, ...}, f) = Int.exists (0, !size, fn i => f (Array.sub (elts, i))) fun foreach (T {elts, size}, f) = Int.for (0, !size, fn i => f (Array.sub (elts, i))) fun clear (T {size, ...}) = size := 0 end structure NFA: sig (* The states in an NFA are indexed from 0 to n-1, where n is the number * of states. *) type state = int (* State i is final iff Array.sub (final, i). * The outgoing states from state i on input char c are given by * Array2.sub (next, i, Char.ord c). * anchorStarts is sorted in increasing order of state index. *) datatype t = T of {anchorStarts: state list, final: bool array, seen: bool array, stack1: int Stack.t, stack2: int Stack.t, start: state, next: state vector Array2.t} val fromRegexp: Regexp.t -> t val match: {nfa: t, string: string, startPos: int, anchorStart: bool} -> int option val numStates: t -> int end = struct type state = int datatype t = T of {anchorStarts: state list, final: bool array, seen: bool array, stack1: int Stack.t, stack2: int Stack.t, start: state, next: state vector Array2.t} fun numStates (T {next, ...}) = Array2.nRows next (* Simulating an NFA with two stacks and a bit vector, as in Algorithm 3.4 * (page 126) of Compilers: Principles, Techniques, and Tools by Aho, * Sethi, and Ullman. *) fun match {anchorStart: bool, nfa as T {anchorStarts, final, seen, stack1, stack2, start, next}, startPos, string = s}: int option = let val numStates = numStates nfa val n = String.size s val _ = Array.modify (seen, fn _ => false) fun loop (current: state Stack.t, nextStates: state Stack.t, i: int, last: int option): int option = let val last = if Stack.exists (current, fn s => Array.sub (final, s)) then SOME i else last in if numStates = 0 orelse i = n then (Stack.clear stack1 ; Stack.clear stack2 ; last) else let val _ = Array.modify (seen, fn _ => false) val c = Char.toInt (String.sub (s, i)) val _ = Stack.foreach (current, fn s => Vector.foreach (Array2.sub (next, s, c), fn s' => if Array.sub (seen, s') then () else (Array.update (seen, s', true) ; Stack.push (nextStates, s')))) val _ = Stack.clear current in loop (nextStates, current, i + 1, last) end end val _ = Stack.push (stack1, start) val _ = if anchorStart then List.foreach (anchorStarts, fn s => Stack.push (stack1, s)) else () in loop (stack1, stack2, startPos, NONE) end (* This conversion from a regular expression to an NFA is based on * Section 3.9 (pages 134 -- 140) of Compilers: Principles, Techniques, * and Tools by Aho, Sethi, and Ullman. * * It creates one NFA state for each CharSet (called a "position") that is * in the regexp. There is also one extra state for the start state. * It adds edges as in rules 1 and 2 (page 138) for the followpos function. *) fun fromRegexp (r: Regexp.t): t = let fun loop (r, ac) = let open Regexp in case r of AnchorStart => ac + 1 | CharSet _ => ac + 1 | Or rs => List.fold (rs, ac, loop) | Seq rs => List.fold (rs, ac, loop) | Star r => loop (r, ac) end val numPos = loop (r, 0) val numStates = numPos + 1 val start = numPos val posCounter = ref ~1 val follow = Array2.new (numStates, numStates, false) val posChars = Array2.tabulate (numPos, numChars, fn _ => false) local datatype t = T of bool vector in fun contains (T v, s) = Vector.sub (v, s) val empty: t = T (Vector.new (numPos, false)) fun union (T v, T v'): t = T (Vector.tabulate (numPos, fn i => Vector.sub (v, i) orelse Vector.sub (v', i))) fun singleton (i: int): t = T (Vector.tabulate (numPos, fn j => i = j)) fun foreach (T v, f) = Vector.foreachi (v, fn (i, b) => if b then f i else ()) end val anchorStarts = ref [] fun loop r = case r of Regexp.AnchorStart => let val i = ++ posCounter val _ = List.push (anchorStarts, i) val first = singleton i in {first = first, last = first, nullable = false} end | Regexp.CharSet f => let val i = ++ posCounter val _ = Int.for (0, numChars, fn c => if f (Char.chr c) then Array2.update (posChars, i, c, true) else ()) val first = singleton i in {first = first, last = first, nullable = false} end | Regexp.Or rs => List.fold (rs, {first = empty, last = empty, nullable = false}, fn (r, {first = f, last = l, nullable = n}) => let val {first = f', last = l', nullable = n'} = loop r in {first = union (f, f'), last = union (l, l'), nullable = n orelse n'} end) | Regexp.Seq rs => List.fold (rs, {first = empty, last = empty, nullable = true}, fn (r, {first = f, last = l, nullable = n}) => let val {first = f', last = l', nullable = n'} = loop r val _ = foreach (l, fn s => foreach (f', fn s' => Array2.update (follow, s, s', true))) in {first = if n then union (f, f') else f, last = if n' then union (l, l') else l', nullable = n andalso n'} end) | Regexp.Star r => let val {first = f, last = l, nullable = n} = loop r val _ = foreach (l, fn s => foreach (f, fn s' => Array2.update (follow, s, s', true))) in {first = f, last = l, nullable = true} end val {first, last, nullable} = loop r (* Any anchor starts in first should be anchor starts. * This also reverses anchorStarts so they are in order. *) val anchorStarts = List.fold (!anchorStarts, [], fn (s, ac) => if contains (first, s) then s :: ac else ac) val _ = foreach (first, fn i => Array2.update (follow, start, i, true)) val final = Array.array (numStates, false) val _ = foreach (last, fn i => Array.update (final, i, true)) val _ = if nullable then Array.update (final, start, true) else () val a = Array.new (numStates, false) val next = Array2.tabulate (numStates, numChars, fn (i, c) => let val _ = Int.for (0, numStates, fn j => Array.update (a, j, false)) val _ = Int.for (0, numPos, fn j => if Array2.sub (follow, i, j) andalso Array2.sub (posChars, j, c) then Array.update (a, j, true) else ()) in Array.indices a end) in T {anchorStarts = anchorStarts, final = final, next = next, seen = Array.new (numStates, false), stack1 = Stack.new (numStates, ~1), stack2 = Stack.new (numStates, ~1), start = start} end end structure DFA: sig type t val fromNFA: NFA.t -> t val match: {dfa: t, string: string, startPos: int, anchorStart: bool} -> int option val minimize: t -> t end = struct (* The states in a DFA are indexed from 0 to n-1, where n is the number * of states. *) type state = int (* State i is final iff Array.sub (final, i). * The outgoing state from state i on input char c is * Array2.sub (next, i, Char.ord c). *) datatype t = T of {anchorStart: state, dead: bool array, final: bool array, next: state Array2.t, start: state} fun numStates (T {next, ...}): int = Array2.nRows next fun match {dfa as T {anchorStart = ancSt, dead, final, start, next}, string as s, startPos: int, anchorStart: bool}: int option = let val n = String.size s fun loop (i: int, state: int, last: int option): int option = let val last = if Array.sub (final, state) then SOME i else last in if Array.sub (dead, state) orelse i = n then last else loop (i + 1, Array2.sub (next, state, Char.toInt (String.sub (s, i))), last) end in loop (startPos, if anchorStart then ancSt else start, NONE) end fun dead (numStates, final, next) = Array.tabulate (numStates, fn i => not (Array.sub (final, i)) andalso Int.forall (0, numChars, fn c => i = Array2.sub (next, i, c))) (* This DFA minimization algorithm is based on algorithm 3.6 (page 142) * of Compilers: Principles, Techniques, and Tools by Aho, Sethi, and * Ullman. * * It maintains an array, r, that stores for each state s the * representative of the class to which s belongs. * It repeatedly refines an equivalence relation, represented by a list * of classes, where each class is a list of states (i.e. ints). *) fun minimize (dfa as T {anchorStart, final, start, next, ...}): t = let val numStates = numStates dfa type class = int list type classes = class list val repCounter = ref ~1 val change = ref false fun newRep () = (change := true; ++ repCounter) val finRep = newRep () val nonfinRep = newRep () val r = Array.tabulate (numStates, fn i => if Array.sub (final, i) then finRep else nonfinRep) fun rep s = Array.sub (r, s) fun trans (s, c) = rep (Array2.sub (next, s, c)) fun refine (class: class, ac: classes): classes = let val r = List.fold (class, [], fn (state, classes) => let fun loop (classes, ac) = case classes of [] => (case ac of [] => [{class = [state], old = state}] | _ => let val s = newRep () val _ = Array.update (r, state, s) in {class = [state], old = state} :: ac end) | (z as {class, old}) :: classes => if Int.forall (0, numChars, fn c => trans (old, c) = trans (state, c)) then (Array.update (r, state, rep old) ; {class = state :: class, old = old} :: (List.appendRev (classes, ac))) else loop (classes, z :: ac) in loop (classes, []) end) in List.fold (r, ac, fn ({class, ...}, ac) => case class of [_] => ac | _ => class :: ac) end fun refineAll (classes: classes): unit = case classes of [] => () | _ => let val _ = change := false val classes = List.fold (classes, [], fn (class, ac) => case class of [_] => ac | _ => refine (class, ac)) in if !change then refineAll classes else () end val (fin, nonfin) = Int.fold (0, numStates, ([], []), fn (i, (f, n)) => if Array.sub (final, i) then (i :: f, n) else (f, i :: n)) val _ = refineAll [fin, nonfin] val numStates' = 1 + !repCounter val reached = Array.new (numStates', false) fun visit (s: int ): unit = let val s' = rep s in if Array.sub (reached, s') then () else (Array.update (reached, s', true) ; Int.for (0, numChars, fn c => visit (Array2.sub (next, s, c)))) end val _ = visit start val _ = visit anchorStart val c = ref ~1 val newR = Array.tabulate (numStates', fn s => if Array.sub (reached, s) then ++ c else ~1) val numStates' = 1 + !c val _ = Array.modify (r, fn s => Array.sub (newR, s)) val next' = Array2.new (numStates', numChars, ~1) val _ = Array2.foreachi (next, fn (s, c, s') => Array2.update (next', rep s, c, rep s')) val final' = Array.array (numStates', false) val _ = Array.foreachi (final, fn (i, b) => if b then Array.update (final', rep i, true) else ()) in T {anchorStart = rep anchorStart, dead = dead (numStates', final', next'), final = final', start = rep start, next = next'} end (* This is the usual "subset construction", as in algorithm 3.2 (page 118) * of Compilers: Principles, Techniques, and Tools by Aho, Sethi, and * Ullman. * * It associates each (reachable) set of states in the nfa with a single * state in the DFA. *) fun fromNFA (nfa as NFA.T {anchorStarts, final, start, next, ...}) = let type states = state vector val counter = ref ~1 type work = {states: states, state: int, out: int vector option ref} val cache: work list ref = ref [] val todo: work list ref = ref [] fun statesToState (ss: states): int = case List.peek (!cache, fn {states, ...} => ss = states) of NONE => let val state = ++ counter val work = {out = ref NONE, state = state, states = ss} val _ = List.push (cache, work) val _ = List.push (todo, work) in state end | SOME {state, ...} => state local val seen = Array.array (NFA.numStates nfa, false) in fun loop () = case !todo of [] => () | {states, out, ...} :: rest => (todo := rest ; out := (SOME (Vector.tabulate (numChars, fn c => let val _ = Array.modify (seen, fn _ => false) val _ = Vector.foreach (states, fn s => Vector.foreach (Array2.sub (next, s, c), fn s' => Array.update (seen, s', true))) in statesToState (Array.indices seen) end))) ; loop ()) end val start' = statesToState (Vector.new1 start) val anchorStart' = statesToState (Vector.fromList (List.insert (anchorStarts, start, op <=))) val _ = loop () val numStates = 1 + !counter val next' = Array2.new (numStates, numChars, ~1) val final' = Array.new (numStates, false) val _ = List.foreach (!cache, fn {states, state = i, out, ...}: work => let val _ = Vector.foreachi (valOf (! out), fn (c, j) => Array2.update (next', i, c, j)) val _ = if Vector.exists (states, fn s => Array.sub (final, s)) then Array.update (final', i, true) else () in () end) val dead' = dead (numStates, final', next') in T {anchorStart = anchorStart', dead = dead', final = final', start = start', next = next'} end end structure Regexp: sig structure Compiled: sig type t val find: t * string -> {start: int, length: int} option end type t val anchorStart: t val any: t val char: char -> t val compileDFA: t -> Compiled.t val compileNFA: t -> Compiled.t val digit: t val nonDigit: t val notOneOf: string -> t val oneOf: string -> t val or: t list -> t val seq: t list -> t val star: t -> t end = struct open Regexp val anchorStart = AnchorStart val isChar = CharSet fun isNotChar f = isChar (not o f) fun char c = isChar (fn c' => c = c') val or = Or val seq = Seq val star = Star val any = isChar (fn _ => true) fun oneOf s = isChar (fn c => String.contains (s, c)) fun notOneOf s = isNotChar (fn c => String.contains (s, c)) val digs = "0123456789" val digit = oneOf digs val nonDigit = notOneOf digs val empty = Or [] val emptyString = Seq [] structure Compiled = struct datatype t = DFA of DFA.t | NFA of NFA.t fun find (c: t, s: string) = let val n = String.size s fun loop (i: int, anchorStart: bool) = if i = n then NONE else let val res = case c of DFA dfa => DFA.match {dfa = dfa, string = s, startPos = i, anchorStart = anchorStart} | NFA nfa => NFA.match {nfa = nfa, string = s, startPos = i, anchorStart = anchorStart} in case res of NONE => loop (i + 1, false) | SOME finish => SOME {start = i, length = finish - i} end in loop (0, true) end end fun compileDFA r = Compiled.DFA (DFA.minimize (DFA.fromNFA (NFA.fromRegexp r))) fun compileNFA r = Compiled.NFA (NFA.fromRegexp r) end local open Regexp in val d = digit val eol = char #"#" val space = oneOf " \t" val r = seq [or [anchorStart, notOneOf "0123456789("], or [seq [char #"(", d, d, d, char #")"], seq [d, d, d]], char #" ", d, d, d, oneOf " -", d, d, d, d, or [eol, nonDigit]] val comp = Regexp.compileDFA r end fun incr (r: int ref) = r := !r + 1 val ins = TextIO.stdIn fun printl [] = print "\n" | printl(h::t) = ( print h ; printl t ) local val form = "(...) ...-...." val a = CharArray.tabulate (String.size form, fn i => String.sub (form, i)) in fun checkPhone (mustPrint: bool, cnt: int ref, line: string) = case Regexp.Compiled.find (comp, line) of NONE => () | SOME {start = pos, ...} => let fun blit (src, dst, length) = let val stop = src + length fun loop (src, dst) = if src = stop then () else (CharArray.update (a, dst, String.sub (line, src)) ; loop (src + 1, dst + 1)) in loop (src, dst) end val (o1, o2, o3) = if #"(" = String.sub (line, pos) then (1, 6, 10) else if #"(" = String.sub (line, pos + 1) then (2, 7, 11) else if Char.isDigit (String.sub (line, pos)) then (0, 4, 8) else (1, 5, 9) val _ = blit (pos + o1, 1, 3) val _ = blit (pos + o2, 6, 3) val _ = blit (pos + o3, 10, 4) val _ = if mustPrint then printl [Int.toString (!cnt), ": ", CharArray.extract (a, 0, NONE)] else () val _ = incr cnt in () end end fun doit (phones,mustPrint: bool): unit = let val cnt = ref 1 in List.foreach (phones, fn line => checkPhone (mustPrint, cnt, line)) end fun atoi s = case Int.fromString s of SOME num => num | NONE => 0 fun main (name, args) = let val n = atoi (hd (args @ ["1"])) val phones = let fun loop lines = case TextIO.inputLine ins of "" => rev lines | line => loop (line :: lines) in loop [] end val _ = Int.for (1, n, fn _ => doit (phones,false)) val _ = doit (phones,true) in OS.Process.success end end val _ = SMLofNJ.exportFn("regexmatch", Test.main);