(* -*- mode: sml -*-
* $Id: wordfreq.smlnj,v 1.3 2001/07/09 00:25:29 doug Exp $
* http://www.bagley.org/~doug/shootout/
* from Stephen Weeks
*)
fun for (start, stop, f) =
let
fun loop i =
if i > stop
then ()
else (f i; loop (i + 1))
in
loop start
end
fun incr r = r := 1 + !r
val sub = Array.sub
val update = Array.update
signature HASH_SET =
sig
type 'a t
val foreach: 'a t * ('a -> unit) -> unit
(* lookupOrInsert (s, h, p, f) looks in the set s for an entry with hash h
* satisfying predicate p. If the entry is there, it is returned.
* Otherwise, the function f is called to create a new entry, which is
* inserted and returned.
*)
val lookupOrInsert: 'a t * word * ('a -> bool) * (unit -> 'a) -> 'a
val new: {hash: 'a -> word} -> 'a t
val size: 'a t -> int
end
structure HashSet: HASH_SET =
struct
datatype 'a t =
T of {buckets: 'a list array ref,
hash: 'a -> word,
mask: word ref,
numItems: int ref}
val initialSize: int = 65536
val initialMask: word = Word.fromInt initialSize - 0w1
fun 'a new {hash}: 'a t =
T {buckets = ref (Array.array (initialSize, [])),
hash = hash,
numItems = ref 0,
mask = ref initialMask}
fun size (T {numItems, ...}) = !numItems
fun numBuckets (T {buckets, ...}) = Array.length (!buckets)
fun index (w: word, mask: word): int =
Word.toInt (Word.andb (w, mask))
fun resize (T {buckets, hash, mask, ...}, size: int, newMask: word): unit =
let
val newBuckets = Array.array (size, [])
in Array.app (fn r =>
List.app (fn a =>
let val j = index (hash a, newMask)
in Array.update
(newBuckets, j,
a :: Array.sub (newBuckets, j))
end) r) (!buckets)
; buckets := newBuckets
; mask := newMask
end
fun maybeGrow (s as T {buckets, mask, numItems, ...}): unit =
let
val n = Array.length (!buckets)
in if !numItems * 4 > n
then resize (s,
n * 2,
Word.orb (0w1, Word.<< (!mask, 0w1)))
else ()
end
fun peekGen (T {buckets = ref buckets, mask, ...}, w, p, no, yes) =
let
val j = index (w, !mask)
val b = Array.sub (buckets, j)
in case List.find p b of
NONE => no (j, b)
| SOME a => yes a
end
fun lookupOrInsert (table as T {buckets, numItems, ...}, w, p, f) =
let
fun no (j, b) =
let val a = f ()
val _ = incr numItems
val _ = Array.update (!buckets, j, a :: b)
val _ = maybeGrow table
in a
end
in peekGen (table, w, p, no, fn x => x)
end
fun foreach (T {buckets, ...}, f) =
Array.app (fn r => List.app f r) (!buckets)
end
structure Buffer:
sig
type t
val add: t * Word8.word -> unit
val clear: t -> unit
val contents: t -> string
val new: int -> t
end =
struct
datatype t = T of {elts: Word8Array.array ref,
size: int ref}
fun contents (T {elts, size, ...}) =
Byte.bytesToString (Word8Array.extract (!elts, 0, SOME (!size)))
fun clear (T {size, ...}) = size := 0
fun new (bufSize) =
T {elts = ref (Word8Array.array (bufSize, 0w0)),
size = ref 0}
fun add (T {elts, size}, x) =
let
val s = !size
val _ = size := s + 1
val a = !elts
val n = Word8Array.length a
in
if s = n
then
let
val a' =
Word8Array.tabulate
(2 * n, fn i =>
if i < n then Word8Array.sub (a, i) else 0w0)
val _ = elts := a'
val _ = Word8Array.update (a', s, x)
in ()
end
else Word8Array.update (a, s, x)
end
end
structure Quicksort:
sig
val quicksort: 'a array * ('a * 'a -> bool) -> unit
end =
struct
fun assert (s, f: unit -> bool) =
if true orelse f ()
then ()
else raise Fail (concat ["assert: ", s])
fun forall (low, high, f) =
let
fun loop i = i > high orelse (f i andalso loop (i + 1))
in
loop low
end
fun fold (l, u, state, f) =
let
fun loop (i, state) =
if i > u
then state
else loop (i + 1, f (i, state))
in
loop (l, state)
end
fun 'a isSorted (a: 'a array,
lo: int,
hi: int,
op <= : 'a * 'a -> bool) =
let
fun loop (i, x) =
i > hi
orelse let
val y = sub (a, i)
in
x <= y andalso loop (i + 1, y)
end
in
lo >= hi orelse loop (lo + 1, sub (a, lo))
end
local
open Word
val seed = ref 0w13
in
fun rand () =
let
val res = 0w1664525 * !seed + 0w1013904223
val _ = seed := res
in
toIntX res
end
end
fun randInt (lo, hi) = lo + Int.mod (rand(), hi - lo + 1)
fun insertionSort (a: 'a array, op <= : 'a * 'a -> bool): unit =
let
fun x i = sub (a, i)
in
for (1, Array.length a - 1, fn i =>
let
val _ =
assert ("insertionSort1", fn () =>
isSorted (a, 0, i - 1, op <=))
val t = x i
fun sift (j: int) =
(assert ("insertionSort2", fn () =>
isSorted (a, 0, j - 1, op <=)
andalso isSorted (a, j + 1, i, op <=)
andalso forall (j + 1, i, fn k => t <= x k))
; if j > 0
then
let
val j' = j - 1
val z = x j'
in if t <= z
then (update (a, j, z);
sift j')
else j
end
else j)
val _ = update (a, sift i, t)
in ()
end)
end
fun 'a quicksort (a: 'a array, op <= : 'a * 'a -> bool): unit =
let
fun x i = Array.sub (a, i)
fun swap (i, j) =
let
val t = x i
val _ = update (a, i, x j)
val _ = update (a, j, t)
in ()
end
val cutoff = 20
fun qsort (l: int, u: int): unit =
if u - l > cutoff
then
let
val _ = swap (l, randInt (l, u))
val t = x l
val m =
fold
(l + 1, u, l, fn (i, m) =>
(assert
("qsort", fn () =>
forall (l + 1, m, fn k => x k <= t)
andalso forall (m + 1, i - 1, fn k => not (x k <= t)))
; if x i <= t
then (swap (m + 1, i)
; m + 1)
else m))
val _ = swap (l, m)
val _ = qsort (l, m - 1)
val _ = qsort (m + 1, u)
in ()
end
else ()
val max = Array.length a - 1
val _ = qsort (0, max)
val _ = insertionSort (a, op <=)
in
()
end
end
structure Test : sig
val main : (string * string list) -> OS.Process.status
end = struct
(* This hash function is taken from pages 56-57 of
* The Practice of Programming by Kernighan and Pike.
*)
fun hash (s: string): word =
let
val n = String.size s
fun loop (i, w) =
if i = n
then w
else Word.fromInt (Char.ord (String.sub (s, i))) + Word.* (w, 0w31)
in
loop (0, 0w0)
end
fun hash (s: string): word =
let
val n = String.size s
fun loop (i, w) =
if i = n
then w
else loop (i + 1,
Word.fromInt (Char.ord (String.sub (s, i)))
+ Word.* (w, 0w31))
in
loop (0, 0w0)
end
val max = 4096
val buf = Word8Array.array (max, 0w0)
val count: {hash: word,
word: string,
count: int ref} HashSet.t = HashSet.new {hash = #hash}
val wbuf = Buffer.new 64
val c2b = Byte.charToByte
fun scan_words (i, n, inword) =
if i < n
then
let
val c = Word8Array.sub (buf, i)
in
if c2b #"a" <= c andalso c <= c2b #"z"
then (Buffer.add (wbuf, c);
scan_words (i + 1, n, true))
else
if c2b #"A" <= c andalso c <= c2b #"Z"
then
(Buffer.add (wbuf, c + 0w32);
scan_words (i + 1, n, true))
else
if inword
then
let
val w = Buffer.contents wbuf
val h = hash w
in
incr (#count
(HashSet.lookupOrInsert
(count, h,
fn {hash, word, ...} =>
hash = h andalso word = w,
fn () => {hash = h, word = w, count = ref 0})));
Buffer.clear wbuf;
scan_words (i + 1, n, false)
end
else scan_words (i + 1, n, false)
end
else
let
val nread =
Posix.IO.readArr (Posix.FileSys.stdin,
{buf = buf, i = 0, sz = NONE})
in
if nread = 0
then ()
else scan_words (0, nread, inword)
end
fun printl [] = print "\n" | printl(h::t) = ( print h ; printl t )
fun rightJustify (s: string, width: int) =
let
val n = String.size s
in concat [CharVector.tabulate (width - n, fn _ => #" "), s]
end
fun main (name, args) =
let
val _ = scan_words (0, 0, false)
val a = Array.array (HashSet.size count, (0, ""))
val i = ref 0
val _ = HashSet.foreach (count, fn {word, count, ...} =>
(Array.update (a, !i, (!count, word)); incr i))
val _ = Quicksort.quicksort (a, fn ((c, w), (c', w')) =>
c > c' orelse c = c' andalso w >= w')
val _ = Array.app (fn (c, w) =>
printl [rightJustify (Int.toString c, 7), "\t", w]) a
in
OS.Process.success
end
end
val _ = SMLofNJ.exportFn("wordfreq", Test.main);