#
# Concurrent toxline dedup.
# Usage: ./t2 [-n=nthreads] datfile [outfile]
#
#
# Need a (multi-platform) way to detect # of cores present!
#
$define DEFAULT_NTHREADS 64
global nthreads
# One parsed record from the data file, as produced by
# threaddata.computeLs() and consumed by the dedup loop in main().
class doc(docno,     # document identifier string scanned from the record body
	  queue, # [...] list of strings
	  text,      # raw substring of the input covering this record
	  knowndup)  # 1 if docno was already seen in the same chunk, &null otherwise
end
#
# Per-thread working state: each thread parses one substring (chunk) of
# the whole input file into lines and <DOC>...</DOC> records.
#
# NOTE(review): the tag literals below ("<DOC>", "</DOC>", "<DOCNO>",
# "</DOCNO>") were empty strings in the source as received — almost
# certainly stripped by an HTML-sanitizing step.  They are restored here
# from the surrounding arithmetic (tab(find(...)+7) for the 7-char
# "<DOCNO>") and the record start/end index computations.  With empty
# strings every find("") succeeds, so parsing was broken.
#
class threaddata(id,        # thread id (1-based)
		 schunk,    # substring of the input this thread works on
		 Ls,        # list of string lines parsed from schunk
		 Lqueue,    # list of doc objects, one per <DOC> record
		 Sdocnos,   # set of docnos seen by this thread
		 dupsfound  # count of duplicate docnos within this chunk
		 )

   #
   # Split schunk into lines and group them into <DOC>...</DOC> records,
   # flagging duplicate docnos that occur within this same chunk.
   #
   method computeLs()
      local line, i, queue, docno, recstartindex, recendindex, stopstr, srest
      Ls := []
      Lqueue := []
      Sdocnos := set()
      schunk ? {
	 every i := find("\n", schunk) do {
	    line := tab(i)
	    # dataset-specific patch: shift " 1900" dates forward a century
	    if find(" 1900", line) then
	       line := replacem(line, " 1900", " 2000")
	    move(1)			# step past the newline
	    put(Ls, line)
	    if find("<DOC>", line) then {		# record opens
	       queue := []
	       recstartindex := i - *line		# start of this line
	       put(queue, line)
	    } else if find("</DOC>", line) then {	# record closes
	       put(queue, line)
	       recendindex := i
	       # this doesn't seem to be happening, but I worry
	       if /docno then {
		  stopstr := "uh oh, null docno!\n"
		  every stopstr ||:= !queue do stopstr ||:= "\n"
		  stop(stopstr)
	       }
	       put(Lqueue,
		   doc(docno, queue,
		       schunk[recstartindex:recendindex],
		       # knowndup: 1 when this docno repeats inside the
		       # chunk, &null the first time it is seen
		       if member(Sdocnos, docno) then {
			  write("wow parallel founddup")
			  dupsfound +:= 1
			  1 }
		       else { insert(Sdocnos, docno); &null }
		       ))
	       queue := &null
	       docno := &null
	    } else {
	       if \queue then {		# currently inside a record
		  put(queue, line)
		  # pull the identifier out of a <DOCNO>...</DOCNO> line
		  line ? if tab(find("<DOCNO>") + 7) then {
		     docno := tab(find("</DOCNO>"))
		  }
	       }
	       else if *line > 0 then write("discarded ", image(line))
	    }
	 }
	 # keep any trailing text after the final newline
	 srest := tab(0)
	 if *srest > 0 then put(Ls, srest)
      }
   end
initially
   dupsfound := 0
end
global wholefile, fout, fdup, docnos
#
# Read the whole data file, carve it into nthreads chunks on </DOC>
# boundaries, parse the chunks concurrently, then serially dedup the
# records by docno: first-seen records go to the output file, repeated
# docnos are listed in toxline.dups.
#
# NOTE(review): the "</DOC>" literal in the chunk-boundary search was an
# empty string in the source as received (stripped tags); restored from
# the matching "+6" offset.
#
procedure main(av)
   local sz, fin, outname, Ltd, Lthr, chunksize, startindex, endindex,
      i, sumlines, nrecs, n, rec, out_to_fout, out_to_fdup, t1, t2, t3

   if match("-n=", av[1]) then {
      nthreads := integer(av[1][4:0]) |
	 stop("Bad thread count in ", av[1])
      pop(av)
      }
   else nthreads := DEFAULT_NTHREADS
   # a datfile argument is required once any option is consumed
   *av > 0 | stop("Usage: t2 [-n=nthreads] datfile [outfile]")
   write("super-toxline-dedup startup, using ", nthreads, " threads")
   docnos := set()
   sz := stat(av[1]).size
   fin := open(av[1], "r") | stop("Cannot open " || av[1])
   write("size is ", sz)
   t1 := &time
   wholefile := reads(fin, sz)
   t2 := &time
   write("read ", *wholefile, " bytes in ", t2 - t1, "ms")
   close(fin)
   # default output name: input name + ".out".  Bind the name first so
   # the failure message can always be built (the original concatenated
   # av[2] in the stop() call, which itself fails when av[2] is absent,
   # silently leaving fout null).
   outname := av[2] | (av[1] || ".out")
   fout := open(outname, "w") | stop("Cannot open " || outname)
   fdup := open("toxline.dups", "w") |
      stop("Cannot open for write: toxline.dups.")

   # each chunk ends just past a "</DOC>" so no record straddles chunks
   Ltd := list(nthreads)
   chunksize := *wholefile / nthreads
   startindex := 1
   every i := 1 to nthreads - 1 do {
      endindex := startindex + chunksize
      endindex := find("</DOC>", wholefile, endindex) + 6	# just past the tag
      Ltd[i] := threaddata(i, wholefile[startindex : endindex])
      startindex := endindex
      }
   Ltd[nthreads] := threaddata(nthreads, wholefile[startindex : 0])

   Lthr := list(nthreads)
   every i := 1 to nthreads do
      Lthr[i] := thread Ltd[i].computeLs()
   every wait(!Lthr)

   sumlines := 0
   every sumlines +:= *((!Ltd).Ls)
   nrecs := 0
   every nrecs +:= *((!Ltd).Lqueue)
   t3 := &time
   write("computed ", sumlines, " lines, ", nrecs, " records in ", t3 - t2, "ms")

   # global (cross-chunk) dedup; buffer output and flush every 1000 recs
   n := 0
   out_to_fout := ""
   out_to_fdup := ""
   every rec := !((!Ltd).Lqueue) do {
      n +:= 1
      if n % 1000 = 0 then {
	 write("dup/out ", n)
	 writes(fdup, out_to_fdup)
	 writes(fout, out_to_fout)
	 out_to_fout := ""
	 out_to_fdup := ""
	 }
      if member(docnos, rec.docno) then {
	 out_to_fdup ||:= rec.docno
	 out_to_fdup ||:= "\n"
	 }
      else {
	 insert(docnos, rec.docno)
	 out_to_fout ||:= rec.text
	 out_to_fout ||:= "\n"
	 }
      }
   writes(fdup, out_to_fdup)
   writes(fout, out_to_fout)
end