# mytoxline-dedup.icn
# one reader thread (main) that just filters the junk before
# one analyzer thread that just looks for docno's
# one writer thread
# basic question: is it faster to send a line at a time between threads,
# or faster to send a record at a time in a small batch mode
# next question: faster to use send/receive, or faster to use a shared queue
# i.e. is it better to send/receive, or better to put/get
# third question: would it be better to transmit with finer (one-line) or coarser (larger batches) granularity?
# normally in thread communication, the cost of each transmission would be a factor.
# fourth question: why does it segfault? :-)
link strings
global fout, fdup, docnos, queue, mywriter, myanalyst
# writer: output stage, intended to run as its own thread.
# Repeatedly takes a message from this thread's inbox with a blocking
# receive.  Each non-null message is treated as a list of lines, which are
# removed from the front one at a time and written to the global file fout.
# A null message ends the loop and the procedure returns.
procedure writer()
   local batch, nreceived

   nreceived := 0
   repeat {
      # The counter is bumped (and progress reported) before the receive,
      # so it counts receive attempts, including the final null message.
      nreceived +:= 1
      (nreceived % 10000 = 0) & write("writing ", nreceived)

      batch := <<@
      if /batch then break

      # Drain the batch front-to-back; the loop ends when the list is empty.
      while write(fout, get(batch))
      }
end
# analyzer: middle stage, intended to run as its own thread.
# Receives batches (lists of lines) from this thread's inbox with a blocking
# receive, accumulates lines in the global list `queue`, counts document
# numbers in the global table `docnos`, writes repeated document numbers to
# the global file `fdup`, and forwards accumulated records to `mywriter`.
# A null message ends the loop and the procedure returns.
#
# NOTE(review): three find() calls below search for the empty string "".
# find("", s) succeeds for every string s, so as written the first branch
# is always taken and the later `else if` branches cannot be reached.
# Presumably the patterns were markup tags that were stripped from this
# copy of the file (move(7) suggests a 7-character tag) -- confirm against
# the original source before relying on this code.
procedure analyzer()
local x, line, docno, i
queue := [ ]
i := 0
repeat {
# Progress is reported before the receive, so i counts receive attempts.
i +:= 1
if i % 10000 = 0 then
write("analyzing ", i)
# Blocking receive; a null message is the shutdown signal.
x := <<@
if /x then break
# Consume the batch one line at a time from the front.
while line := pop(x) do {
# Branch 1: line carrying the document number (pattern is empty here,
# see the NOTE(review) above).
if find("", line) then {
line ? {
# Skip the first 7 characters, then take the text up to the next "<"
# (trailing blanks trimmed) as the document number.
move(7)
docno := trim(tab(find("<")))
docnos[docno] +:= 1
if (docnos[docno] > 1) then {
# Seen before: discard what was accumulated, log the number to fdup,
# and throw away lines from the batch until one matches the pattern.
# The `while` on the next code line has no `do` clause; the `break`
# after it is a separate statement that leaves the enclosing
# `while line := pop(x)` loop, so the put() below is skipped.
queue := []
write(fdup, docno)
while (y := pop(x)) & (not find("", y))
break
}
}
put(queue, line)
}
else if find(" 1900", line) then {
# Branch 2: rewrite " 1900" as " 2000" before keeping the line.
# replacem is not defined in this file; presumably it comes from the
# linked `strings` library -- confirm.
line := replacem(line, " 1900", " 2000")
put(queue, line)
}
else if find("", line) then {
# Branch 3: presumably the end-of-record line (pattern is empty here).
# Keep the line, forward the record to the writer only if its document
# number has been seen exactly once, then start a fresh queue and stop
# processing this batch.
put(queue, line)
if docnos[docno] = 1 then {
queue @>> mywriter
}
queue := []
break
}
# Any other line is simply accumulated.
else put(queue, line)
}
}
end
global inqueue
# main: reader stage and pipeline driver.
#
# Usage: prog infile outfile
#   infile  - input file, read into memory in one piece
#   outfile - output name; the file actually written is "td3." || outfile
# Repeated document numbers are written to "td3.toxline.dups".
#
# The input is split into records; each record is sent as one list of
# lines to the analyzer thread, which forwards accepted records to the
# writer thread.
#
# Changes from the previous version:
#  * Shutdown is ordered.  The null "stop" message is sent to the writer
#    only after the analyzer has finished.  Previously both stop messages
#    were sent together, so the writer's stop could be queued ahead of
#    records the analyzer had not forwarded yet, and those records could
#    be lost.
#  * Arguments are validated and files are opened before any thread is
#    created, so stop() is never called with worker threads running and
#    the workers never start before fout/fdup are assigned.
#  * The input is opened before stat() is applied to it, so a missing
#    file produces only the "Cannot open" message.
#  * All variables used only here are declared local.
#
# NOTE(review): the find("", line) calls below search for the empty
# string, which matches every line.  Presumably the patterns were markup
# tags stripped from this copy of the file -- confirm against the
# original.  They are reproduced unchanged here.
procedure main(args)
   local i, mysize, fin, wholefile, line

   if *args ~= 2 then
      stop("Usage: " || &progname || " infile outfile")

   fin := open(args[1], "r") |
      stop("Cannot open " || args[1])
   mysize := stat(args[1]).size
   write("input size is ", mysize)
   fout := open("td3." || args[2], "w") |
      stop("Cannot open " || args[2])
   fdup := open("td3.toxline.dups", "w") |
      stop("Cannot open for write: toxline.dups.")

   inqueue := []
   wholefile := reads(fin, mysize) | stop("reads failed")
   write("read in ", *wholefile, " bytes")

   # Shared state must exist before the workers can receive any work.
   docnos := table(0)
   mywriter := thread writer()
   myanalyst := thread analyzer()

   i := 0
   wholefile ? {
      # tab(find("\n")|0) always succeeds (tab(0) takes the remainder),
      # so this loop is left only through one of the breaks below.
      while line := tab(find("\n")|0) do {
         i +:= 1
         if i % 10000 = 0 then write("reading line ", i)
         if not find("", line) then {
            # Not a record start: skip it if a newline follows, stop
            # quietly at end of input, otherwise report and stop.
            if ="\n" then next
            if &pos = *wholefile+1 then break
            write("input confusion line ", i, ": ",
                  image(&subject[&pos:0]))
            break
            }
         else {   # found the first line of a record
            put(inqueue, line)
            ="\n"
            # Collect following lines until one matches the pattern.
            while (line := tab(find("\n"))) do {
               #write("reading line ", image(line))
               if (not find("", line)) then {
                  put(inqueue, line)
                  } else { break }
               move(1)
               }
            # line is the one that ended the inner loop.
            # NOTE(review): if the inner loop ended because no newline
            # was left, line still holds the previously queued line and
            # is queued a second time here.
            put(inqueue, line)
            inqueue @>> myanalyst
            inqueue := [ ]
            }
         if not ="\n" then break
         }
      write("finished main/outer while loop")
      }

   # Ordered shutdown: the analyzer must drain its inbox and forward its
   # last record before the writer is told to stop.
   &null @>> myanalyst
   wait(myanalyst)
   &null @>> mywriter
   wait(mywriter)

   close(fin)
   close(fout)
   close(fdup)
end