# mytoxline-dedup.icn
#
# Removes duplicate documents (by document number) from a TOXLINE-style
# file using a three-stage thread pipeline:
#   one reader thread (main) that just filters the junk before each document
#   one analyzer thread that just looks for docno's
#   one writer thread
#
# basic question: is it faster to send a line at a time between threads,
#   or faster to send a record at a time in a small batch mode
# next question: faster to use send/receive, or faster to use a shared queue
#   i.e. is it better to send/receive, or better to put/get
# third question: would it be better to transmit with finer (one-line) or
#   coarser (larger batches) granularity?
#   normally in thread communication, the cost of each transmission would
#   be a factor.
# fourth question: why does it segfault? :-)
#
# NOTE(review): in the copy of this file under review every tag literal
# passed to find() was the empty string, apparently stripped as markup.
# find("", s) always succeeds, which made the " 1900" and end-of-record
# branches unreachable.  The tags are reconstructed here as "<DOC>",
# "<DOCNO>" and "</DOC>"; "<DOCNO>" follows from the move(7) that skips
# the tag (7 = *"<DOCNO>"), the other two are inferred from the record
# structure.  Confirm against the real input before relying on it.

link strings        # replacem()

global fout,        # output file for the de-duplicated documents
       fdup,        # output file listing duplicated document numbers
       docnos,      # table: document number -> times seen so far
       queue,       # lines of the document the analyzer is assembling
       mywriter,    # writer thread
       myanalyst    # analyzer thread
global inqueue      # lines of the document the reader is assembling

#
# Writer thread: receives lists of lines from its inbox and writes every
# line to fout.  A &null message ends the thread.
#
procedure writer()
   local batch, count

   count := 0
   repeat {
      count +:= 1
      if count % 10000 = 0 then write("writing ", count)
      batch := <<@                       # blocking receive
      if /batch then break               # &null is the shutdown sentinel
      while write(fout, pop(batch))
   }
end

#
# Analyzer thread: receives one document (a list of lines) per message.
# The first time a document number is seen the document is forwarded to
# the writer thread; on later sightings the number is written to fdup and
# the document is dropped.  " 1900" is rewritten to " 2000" on lines that
# contain it.  A &null message ends the thread.
#
procedure analyzer()
   local batch, line, skipped, docno, count

   queue := []
   count := 0
   repeat {
      count +:= 1
      if count % 10000 = 0 then write("analyzing ", count)
      batch := <<@                       # blocking receive
      if /batch then break               # &null is the shutdown sentinel

      while line := pop(batch) do {
         if find("<DOCNO>", line) then {
            # Extract the text between the opening tag and the next "<".
            # As in the original, the tag is assumed to start the line.
            line ? {
               move(7)                   # 7 = *"<DOCNO>"
               docno := trim(tab(find("<")))
            }
            docnos[docno] +:= 1
            if docnos[docno] > 1 then {
               # Duplicate: discard what was collected, log the number and
               # drop the remaining lines of this document.
               queue := []
               write(fdup, docno)
               while (skipped := pop(batch)) & (not find("</DOC>", skipped))
               break
            }
            put(queue, line)
         }
         else if find(" 1900", line) then {
            line := replacem(line, " 1900", " 2000")
            put(queue, line)
         }
         else if find("</DOC>", line) then {
            # End of document: hand it to the writer if it is the first
            # occurrence of its number, then start a fresh queue.
            put(queue, line)
            if docnos[docno] = 1 then queue @>> mywriter
            queue := []
            break
         }
         else put(queue, line)
      }
   }
end

#
# Reader: loads the whole input file, splits it into documents, sends each
# document to the analyzer thread, then shuts the pipeline down in order.
#
# args[1] is the input file; args[2] is the output name (the file actually
# written is "td3." || args[2]).
#
procedure main(args)
   local lineno, mysize, fin, wholefile, line

   # Do everything that can stop() before any thread exists.
   if *args ~= 2 then stop("Usage: " || &progname || " infile outfile")

   mysize := stat(args[1]).size | stop("Cannot stat " || args[1])
   write("input size is ", mysize)
   fin := open(args[1], "r") | stop("Cannot open " || args[1])
   fout := open("td3." || args[2], "w") | stop("Cannot open " || args[2])
   fdup := open("td3.toxline.dups", "w") |
      stop("Cannot open for write: toxline.dups.")

   wholefile := reads(fin, mysize) | stop("reads failed")
   write("read in ", *wholefile, " bytes")

   docnos := table(0)
   mywriter := thread writer()
   myanalyst := thread analyzer()

   inqueue := []
   lineno := 0
   wholefile ? {
      while line := tab(find("\n") | 0) do {
         lineno +:= 1
         if lineno % 10000 = 0 then write("reading line ", lineno)
         if not find("<DOC>", line) then {
            # Junk between documents: skip the line, or stop at end of input.
            if ="\n" then next
            if &pos = *wholefile + 1 then break
            write("input confusion line ", lineno, ": ",
                  image(&subject[&pos:0]))
            break
         }
         else {
            # found a <DOC>: collect lines up to and including </DOC>
            put(inqueue, line)
            ="\n"
            while line := tab(find("\n")) do {
               if find("</DOC>", line) then break
               put(inqueue, line)
               move(1)                   # step over the newline
            }
            put(inqueue, line)
            inqueue @>> myanalyst
            inqueue := []
         }
         if not ="\n" then break
      }
      write("finished main/outer while loop")
   }

   # Shut down stage by stage.  The writer must not get its sentinel until
   # the analyzer has finished, otherwise it can exit while documents are
   # still on their way to it and they are lost.
   &null @>> myanalyst
   wait(myanalyst)
   &null @>> mywriter
   wait(mywriter)

   close(fin)
   close(fout)
   close(fdup)
end