(* -*- mode: sml -*-
* $Id: regexmatch.smlnj,v 1.1 2001/07/10 13:04:21 doug Exp $
* http://www.bagley.org/~doug/shootout/
* from Stephen Weeks
* "ported" to SML/NJ
* with help from Daniel Wang
*)
structure Test : sig
val main : (string * string list) -> OS.Process.status
end = struct
fun ++ (r: int ref): int =
let
val n = 1 + !r
val _ = r := n
in n
end
structure Int =
struct
open Int
type t = int
fun exists (start: t, stop: t, f: t -> bool): bool =
let
fun loop i = i < stop andalso (f i orelse loop (i + 1))
in
loop start
end
fun forall (start, stop, f) = not (exists (start, stop, not o f))
fun fold (start: t, stop: t, a: 'a, f: t * 'a -> 'a): 'a =
let
fun loop (i: t, a: 'a) =
if i >= stop
then a
else loop (i + 1, f (i, a))
in loop (start, a)
end
fun for (start: t, stop: t, f: t -> unit): unit =
let
fun loop i =
if i >= stop
then ()
else (f i; loop (i + 1))
in
loop start
end
end
structure Array2 =
struct
open Array2
type 'a t = 'a array
val new = array
fun tabulate (r, c, f) = Array2.tabulate RowMajor (r, c, f)
fun foreachi (a, f) =
appi RowMajor f {base = a, row = 0, col = 0,
nrows = NONE, ncols = NONE}
end
structure Vector =
struct
open Vector
fun exists (v, f) =
Int.exists (0, length v, fn i => f (sub (v, i)))
fun foreach (v, f) = app f v
fun foreachi (v, f) = appi f (v, 0, NONE)
fun new (n, x) = tabulate (n, fn _ => x)
fun new1 x = new (1, x)
end
structure List =
struct
open List
fun foreach (l, f) = app f l
fun fold (l, b, f) = foldl f b l
fun appendRev (l1, l2) = fold (l1, l2, op ::)
fun push (r, x) = r := x :: !r
fun keepAll (l, f) = filter f l
fun peek (l, f) = find f l
fun insert (l, x, op <=) =
let
fun loop (l, ac) =
case l of
[] => appendRev (ac, [x])
| x' :: l' =>
if x <= x'
then appendRev (ac, x :: l)
else loop (l', x' :: ac)
in loop (l, [])
end
end
structure Array =
struct
open Array
val new = array
fun modify (a, f) = Array.modify f a
fun foreachi (a, f) = appi f (a, 0, NONE)
fun indices (a: bool array): int vector =
let
val n = Array.length a
val numTrue =
let
fun loop (i, count) =
if i = n
then count
else loop (i + 1,
if Array.sub (a, i)
then count + 1
else count)
in loop (0, 0)
end
val next = ref 0
fun loop i =
if Array.sub (a, i)
then (next := i + 1; i)
else loop (i + 1)
in Vector.tabulate (numTrue, fn _ => loop (!next))
end
end
structure Char =
struct
open Char
val fromInt = chr
val toInt = ord
end
structure String =
struct
open String
type t = string
fun contains (s: t, c: char): bool =
Int.exists (0, size s, fn i => c = sub (s, i))
end
val numChars: int = 128
structure Regexp =
struct
datatype t =
AnchorStart
| CharSet of char -> bool
| Or of t list
| Seq of t list
| Star of t
end
structure Stack:
sig
type 'a t
val clear: 'a t -> unit
val exists: 'a t * ('a -> bool) -> bool
val foreach: 'a t * ('a -> unit) -> unit
val new: int * 'a -> 'a t
val push: 'a t * 'a -> unit
end =
struct
datatype 'a t = T of {elts: 'a array,
size: int ref}
fun new (size: int, dummy: 'a): 'a t =
T {elts = Array.new (size, dummy),
size = ref 0}
fun push (T {elts, size}, x) =
let
val n = !size
val _ = Array.update (elts, n, x)
val _ = size := n + 1
in ()
end
fun exists (T {elts, size, ...}, f) =
Int.exists (0, !size, fn i => f (Array.sub (elts, i)))
fun foreach (T {elts, size}, f) =
Int.for (0, !size, fn i => f (Array.sub (elts, i)))
fun clear (T {size, ...}) = size := 0
end
structure NFA:
sig
(* The states in an NFA are indexed from 0 to n-1, where n is the number
* of states.
*)
type state = int
(* State i is final iff Array.sub (final, i).
* The outgoing states from state i on input char c are given by
* Array2.sub (next, i, Char.ord c).
* anchorStarts is sorted in increasing order of state index.
*)
datatype t = T of {anchorStarts: state list,
final: bool array,
seen: bool array,
stack1: int Stack.t,
stack2: int Stack.t,
start: state,
next: state vector Array2.t}
val fromRegexp: Regexp.t -> t
val match: {nfa: t,
string: string,
startPos: int,
anchorStart: bool} -> int option
val numStates: t -> int
end =
struct
type state = int
datatype t = T of {anchorStarts: state list,
final: bool array,
seen: bool array,
stack1: int Stack.t,
stack2: int Stack.t,
start: state,
next: state vector Array2.t}
fun numStates (T {next, ...}) = Array2.nRows next
(* Simulating an NFA with two stacks and a bit vector, as in Algorithm 3.4
* (page 126) of Compilers: Principles, Techniques, and Tools by Aho,
* Sethi, and Ullman.
*)
fun match {anchorStart: bool,
nfa as T {anchorStarts, final, seen, stack1, stack2, start,
next},
startPos,
string = s}: int option =
let
val numStates = numStates nfa
val n = String.size s
val _ = Array.modify (seen, fn _ => false)
fun loop (current: state Stack.t,
nextStates: state Stack.t,
i: int,
last: int option): int option =
let
val last =
if Stack.exists (current, fn s => Array.sub (final, s))
then SOME i
else last
in
if numStates = 0 orelse i = n
then (Stack.clear stack1
; Stack.clear stack2
; last)
else
let
val _ = Array.modify (seen, fn _ => false)
val c = Char.toInt (String.sub (s, i))
val _ =
Stack.foreach (current, fn s =>
Vector.foreach
(Array2.sub (next, s, c),
fn s' =>
if Array.sub (seen, s')
then ()
else (Array.update (seen, s', true)
; Stack.push (nextStates, s'))))
val _ = Stack.clear current
in loop (nextStates, current, i + 1, last)
end
end
val _ = Stack.push (stack1, start)
val _ =
if anchorStart
then List.foreach (anchorStarts, fn s =>
Stack.push (stack1, s))
else ()
in
loop (stack1, stack2, startPos, NONE)
end
(* This conversion from a regular expression to an NFA is based on
* Section 3.9 (pages 134 -- 140) of Compilers: Principles, Techniques,
* and Tools by Aho, Sethi, and Ullman.
*
* It creates one NFA state for each CharSet (called a "position") that is
* in the regexp. There is also one extra state for the start state.
* It adds edges as in rules 1 and 2 (page 138) for the followpos function.
*)
fun fromRegexp (r: Regexp.t): t =
let
fun loop (r, ac) =
let open Regexp
in case r of
AnchorStart => ac + 1
| CharSet _ => ac + 1
| Or rs => List.fold (rs, ac, loop)
| Seq rs => List.fold (rs, ac, loop)
| Star r => loop (r, ac)
end
val numPos = loop (r, 0)
val numStates = numPos + 1
val start = numPos
val posCounter = ref ~1
val follow = Array2.new (numStates, numStates, false)
val posChars = Array2.tabulate (numPos, numChars, fn _ => false)
local
datatype t = T of bool vector
in
fun contains (T v, s) = Vector.sub (v, s)
val empty: t = T (Vector.new (numPos, false))
fun union (T v, T v'): t =
T (Vector.tabulate (numPos, fn i =>
Vector.sub (v, i)
orelse Vector.sub (v', i)))
fun singleton (i: int): t =
T (Vector.tabulate (numPos, fn j => i = j))
fun foreach (T v, f) =
Vector.foreachi (v, fn (i, b) => if b then f i else ())
end
val anchorStarts = ref []
fun loop r =
case r of
Regexp.AnchorStart =>
let
val i = ++ posCounter
val _ = List.push (anchorStarts, i)
val first = singleton i
in
{first = first,
last = first,
nullable = false}
end
| Regexp.CharSet f =>
let
val i = ++ posCounter
val _ =
Int.for (0, numChars, fn c =>
if f (Char.chr c)
then Array2.update (posChars, i, c, true)
else ())
val first = singleton i
in {first = first,
last = first,
nullable = false}
end
| Regexp.Or rs =>
List.fold
(rs, {first = empty,
last = empty,
nullable = false},
fn (r, {first = f, last = l, nullable = n}) =>
let
val {first = f', last = l', nullable = n'} =
loop r
in
{first = union (f, f'),
last = union (l, l'),
nullable = n orelse n'}
end)
| Regexp.Seq rs =>
List.fold
(rs, {first = empty,
last = empty,
nullable = true},
fn (r, {first = f, last = l, nullable = n}) =>
let
val {first = f', last = l', nullable = n'} =
loop r
val _ =
foreach
(l, fn s =>
foreach
(f', fn s' => Array2.update (follow, s, s', true)))
in
{first = if n then union (f, f') else f,
last = if n' then union (l, l') else l',
nullable = n andalso n'}
end)
| Regexp.Star r =>
let
val {first = f, last = l, nullable = n} = loop r
val _ =
foreach
(l, fn s =>
foreach
(f, fn s' => Array2.update (follow, s, s', true)))
in
{first = f, last = l, nullable = true}
end
val {first, last, nullable} = loop r
(* Any anchor starts in first should be anchor starts.
* This also reverses anchorStarts so they are in order.
*)
val anchorStarts =
List.fold (!anchorStarts, [], fn (s, ac) =>
if contains (first, s) then s :: ac else ac)
val _ = foreach (first, fn i =>
Array2.update (follow, start, i, true))
val final = Array.array (numStates, false)
val _ = foreach (last, fn i => Array.update (final, i, true))
val _ = if nullable then Array.update (final, start, true) else ()
val a = Array.new (numStates, false)
val next =
Array2.tabulate
(numStates, numChars, fn (i, c) =>
let
val _ =
Int.for (0, numStates, fn j => Array.update (a, j, false))
val _ =
Int.for
(0, numPos, fn j =>
if Array2.sub (follow, i, j)
andalso Array2.sub (posChars, j, c)
then Array.update (a, j, true)
else ())
in Array.indices a
end)
in
T {anchorStarts = anchorStarts,
final = final,
next = next,
seen = Array.new (numStates, false),
stack1 = Stack.new (numStates, ~1),
stack2 = Stack.new (numStates, ~1),
start = start}
end
end
structure DFA:
sig
type t
val fromNFA: NFA.t -> t
val match: {dfa: t,
string: string,
startPos: int,
anchorStart: bool} -> int option
val minimize: t -> t
end =
struct
(* The states in a DFA are indexed from 0 to n-1, where n is the number
* of states.
*)
type state = int
(* State i is final iff Array.sub (final, i).
* The outgoing state from state i on input char c is
* Array2.sub (next, i, Char.ord c).
*)
datatype t = T of {anchorStart: state,
dead: bool array,
final: bool array,
next: state Array2.t,
start: state}
fun numStates (T {next, ...}): int = Array2.nRows next
fun match {dfa as T {anchorStart = ancSt, dead, final, start, next},
string as s,
startPos: int,
anchorStart: bool}: int option =
let
val n = String.size s
fun loop (i: int, state: int, last: int option): int option =
let
val last =
if Array.sub (final, state)
then SOME i
else last
in
if Array.sub (dead, state) orelse i = n
then last
else loop (i + 1,
Array2.sub (next, state,
Char.toInt (String.sub (s, i))),
last)
end
in loop (startPos,
if anchorStart then ancSt else start,
NONE)
end
fun dead (numStates, final, next) =
Array.tabulate
(numStates, fn i =>
not (Array.sub (final, i))
andalso Int.forall (0, numChars, fn c =>
i = Array2.sub (next, i, c)))
(* This DFA minimization algorithm is based on algorithm 3.6 (page 142)
* of Compilers: Principles, Techniques, and Tools by Aho, Sethi, and
* Ullman.
*
* It maintains an array, r, that stores for each state s the
* representative of the class to which s belongs.
* It repeatedly refines an equivalence relation, represented by a list
* of classes, where each class is a list of states (i.e. ints).
*)
fun minimize (dfa as T {anchorStart, final, start, next, ...}): t =
let
val numStates = numStates dfa
type class = int list
type classes = class list
val repCounter = ref ~1
val change = ref false
fun newRep () = (change := true; ++ repCounter)
val finRep = newRep ()
val nonfinRep = newRep ()
val r = Array.tabulate (numStates, fn i =>
if Array.sub (final, i)
then finRep
else nonfinRep)
fun rep s = Array.sub (r, s)
fun trans (s, c) = rep (Array2.sub (next, s, c))
fun refine (class: class, ac: classes): classes =
let
val r =
List.fold
(class, [], fn (state, classes) =>
let
fun loop (classes, ac) =
case classes of
[] =>
(case ac of
[] => [{class = [state],
old = state}]
| _ =>
let
val s = newRep ()
val _ = Array.update (r, state, s)
in {class = [state],
old = state} :: ac
end)
| (z as {class, old}) :: classes =>
if Int.forall
(0, numChars, fn c =>
trans (old, c) = trans (state, c))
then
(Array.update (r, state, rep old)
; {class = state :: class,
old = old} :: (List.appendRev
(classes, ac)))
else loop (classes, z :: ac)
in loop (classes, [])
end)
in List.fold (r, ac, fn ({class, ...}, ac) =>
case class of
[_] => ac
| _ => class :: ac)
end
fun refineAll (classes: classes): unit =
case classes of
[] => ()
| _ =>
let
val _ = change := false
val classes =
List.fold (classes, [], fn (class, ac) =>
case class of
[_] => ac
| _ => refine (class, ac))
in if !change
then refineAll classes
else ()
end
val (fin, nonfin) =
Int.fold (0, numStates, ([], []), fn (i, (f, n)) =>
if Array.sub (final, i)
then (i :: f, n)
else (f, i :: n))
val _ = refineAll [fin, nonfin]
val numStates' = 1 + !repCounter
val reached = Array.new (numStates', false)
fun visit (s: int ): unit =
let
val s' = rep s
in
if Array.sub (reached, s')
then ()
else (Array.update (reached, s', true)
; Int.for (0, numChars, fn c =>
visit (Array2.sub (next, s, c))))
end
val _ = visit start
val _ = visit anchorStart
val c = ref ~1
val newR = Array.tabulate (numStates', fn s =>
if Array.sub (reached, s)
then ++ c
else ~1)
val numStates' = 1 + !c
val _ = Array.modify (r, fn s => Array.sub (newR, s))
val next' = Array2.new (numStates', numChars, ~1)
val _ =
Array2.foreachi
(next, fn (s, c, s') =>
Array2.update (next', rep s, c, rep s'))
val final' = Array.array (numStates', false)
val _ =
Array.foreachi
(final, fn (i, b) =>
if b then Array.update (final', rep i, true) else ())
in T {anchorStart = rep anchorStart,
dead = dead (numStates', final', next'),
final = final',
start = rep start,
next = next'}
end
(* This is the usual "subset construction", as in algorithm 3.2 (page 118)
* of Compilers: Principles, Techniques, and Tools by Aho, Sethi, and
* Ullman.
*
* It associates each (reachable) set of states in the nfa with a single
* state in the DFA.
*)
fun fromNFA (nfa as NFA.T {anchorStarts, final, start, next, ...}) =
let
type states = state vector
val counter = ref ~1
type work = {states: states,
state: int,
out: int vector option ref}
val cache: work list ref = ref []
val todo: work list ref = ref []
fun statesToState (ss: states): int =
case List.peek (!cache, fn {states, ...} => ss = states) of
NONE =>
let
val state = ++ counter
val work = {out = ref NONE,
state = state,
states = ss}
val _ = List.push (cache, work)
val _ = List.push (todo, work)
in
state
end
| SOME {state, ...} => state
local
val seen = Array.array (NFA.numStates nfa, false)
in
fun loop () =
case !todo of
[] => ()
| {states, out, ...} :: rest =>
(todo := rest
; out := (SOME
(Vector.tabulate
(numChars, fn c =>
let
val _ =
Array.modify (seen, fn _ => false)
val _ =
Vector.foreach
(states, fn s =>
Vector.foreach
(Array2.sub (next, s, c), fn s' =>
Array.update (seen, s', true)))
in statesToState (Array.indices seen)
end)))
; loop ())
end
val start' = statesToState (Vector.new1 start)
val anchorStart' =
statesToState
(Vector.fromList (List.insert (anchorStarts, start, op <=)))
val _ = loop ()
val numStates = 1 + !counter
val next' = Array2.new (numStates, numChars, ~1)
val final' = Array.new (numStates, false)
val _ =
List.foreach
(!cache, fn {states, state = i, out, ...}: work =>
let
val _ =
Vector.foreachi
(valOf (! out), fn (c, j) =>
Array2.update (next', i, c, j))
val _ =
if Vector.exists (states, fn s => Array.sub (final, s))
then Array.update (final', i, true)
else ()
in ()
end)
val dead' = dead (numStates, final', next')
in T {anchorStart = anchorStart',
dead = dead',
final = final',
start = start',
next = next'}
end
end
structure Regexp:
sig
structure Compiled:
sig
type t
val find: t * string -> {start: int, length: int} option
end
type t
val anchorStart: t
val any: t
val char: char -> t
val compileDFA: t -> Compiled.t
val compileNFA: t -> Compiled.t
val digit: t
val nonDigit: t
val notOneOf: string -> t
val oneOf: string -> t
val or: t list -> t
val seq: t list -> t
val star: t -> t
end =
struct
open Regexp
val anchorStart = AnchorStart
val isChar = CharSet
fun isNotChar f = isChar (not o f)
fun char c = isChar (fn c' => c = c')
val or = Or
val seq = Seq
val star = Star
val any = isChar (fn _ => true)
fun oneOf s = isChar (fn c => String.contains (s, c))
fun notOneOf s = isNotChar (fn c => String.contains (s, c))
val digs = "0123456789"
val digit = oneOf digs
val nonDigit = notOneOf digs
val empty = Or []
val emptyString = Seq []
structure Compiled =
struct
datatype t =
DFA of DFA.t
| NFA of NFA.t
fun find (c: t, s: string) =
let
val n = String.size s
fun loop (i: int, anchorStart: bool) =
if i = n
then NONE
else
let
val res =
case c of
DFA dfa =>
DFA.match {dfa = dfa,
string = s,
startPos = i,
anchorStart = anchorStart}
| NFA nfa =>
NFA.match {nfa = nfa,
string = s,
startPos = i,
anchorStart = anchorStart}
in
case res of
NONE => loop (i + 1, false)
| SOME finish => SOME {start = i,
length = finish - i}
end
in loop (0, true)
end
end
fun compileDFA r =
Compiled.DFA (DFA.minimize (DFA.fromNFA (NFA.fromRegexp r)))
fun compileNFA r =
Compiled.NFA (NFA.fromRegexp r)
end
local
open Regexp
in
val d = digit
val eol = char #"#"
val space = oneOf " \t"
val r =
seq [or [anchorStart, notOneOf "0123456789("],
or [seq [char #"(", d, d, d, char #")"],
seq [d, d, d]],
char #" ",
d, d, d,
oneOf " -",
d, d, d, d,
or [eol, nonDigit]]
val comp = Regexp.compileDFA r
end
fun incr (r: int ref) = r := !r + 1
val ins = TextIO.stdIn
fun printl [] = print "\n" | printl(h::t) = ( print h ; printl t )
local
val form = "(...) ...-...."
val a = CharArray.tabulate (String.size form, fn i =>
String.sub (form, i))
in
fun checkPhone (mustPrint: bool, cnt: int ref, line: string) =
case Regexp.Compiled.find (comp, line) of
NONE => ()
| SOME {start = pos, ...} =>
let
fun blit (src, dst, length) =
let
val stop = src + length
fun loop (src, dst) =
if src = stop
then ()
else (CharArray.update (a, dst,
String.sub (line, src))
; loop (src + 1, dst + 1))
in
loop (src, dst)
end
val (o1, o2, o3) =
if #"(" = String.sub (line, pos)
then (1, 6, 10)
else if #"(" = String.sub (line, pos + 1)
then (2, 7, 11)
else if Char.isDigit (String.sub (line, pos))
then (0, 4, 8)
else (1, 5, 9)
val _ = blit (pos + o1, 1, 3)
val _ = blit (pos + o2, 6, 3)
val _ = blit (pos + o3, 10, 4)
val _ =
if mustPrint
then printl [Int.toString (!cnt), ": ",
CharArray.extract (a, 0, NONE)]
else ()
val _ = incr cnt
in
()
end
end
fun doit (phones,mustPrint: bool): unit =
let val cnt = ref 1
in List.foreach (phones, fn line => checkPhone (mustPrint, cnt, line))
end
fun atoi s = case Int.fromString s of SOME num => num | NONE => 0
fun main (name, args) =
let
val n = atoi (hd (args @ ["1"]))
val phones =
let
fun loop lines =
case TextIO.inputLine ins of
"" => rev lines
| line => loop (line :: lines)
in loop []
end
val _ = Int.for (1, n, fn _ => doit (phones,false))
val _ = doit (phones,true)
in OS.Process.success
end
end
val _ = SMLofNJ.exportFn("regexmatch", Test.main);