#
# Concurrent toxline dedup.
# Usage: ./t2 [-n=nthreads] datfile [outfile]
#
#
# Need a (multi-platform) way to detect # of cores present!
#
$define DEFAULT_NTHREADS 64

link strings    # replacem() comes from the IPL strings library

global nthreads

#
# One toxline record: its docno, its lines, its raw text, and whether it
# was already seen as a duplicate within its own thread's chunk.
#
class doc(docno,
          queue,      # [...] list of strings
          text,
          knowndup)
end

class threaddata(id,        # thread id
                 schunk,    # substring this thread works on
                 Ls,        # list of string lines
                 Lqueue,    # list of doc objects
                 Sdocnos,   # set of its docnos
                 dupsfound
                 )

   #
   # Scan this thread's chunk line by line, collecting <DOC>...</DOC>
   # records into Lqueue and counting duplicates seen within the chunk.
   #
   method computeLs()
      Ls := []
      Lqueue := []
      Sdocnos := set()
      #write("thread ", id, " ", schunk[1:20], "...", schunk[-20:0])
      schunk ? {
         every i := find("\n", schunk) do {
            line := tab(i)
            if find(" 1900", line) then
               line := replacem(line, " 1900", " 2000")
            move(1)
            put(Ls, line)
            if find("<DOC>", line) then {
               queue := []
               recstartindex := i - *line
               put(queue, line)
               }
            else if find("</DOC>", line) then {
               put(queue, line)
               recendindex := i
               # this doesn't seem to be happening, but I worry
               if /docno then {
                  stopstr := "uh oh, null docno!\n"
                  every stopstr ||:= !queue do stopstr ||:= "\n"
                  stop(stopstr)
                  }
               put(Lqueue, doc(docno, queue, schunk[recstartindex:recendindex],
                   if member(Sdocnos, docno) then {
                      write("wow parallel founddup")
                      dupsfound +:= 1
                      1
                      }
                   else { insert(Sdocnos, docno); &null }
                   ))
               queue := &null
               docno := &null
               }
            else {
               if \queue then {
                  put(queue, line)
                  line ? if tab(find("<DOCNO>")+7) then {
                     docno := tab(find("</DOCNO>"))
                     }
                  }
               else if *line > 0 then write("discarded ", image(line))
               }
            }
         srest := tab(0)
         if *srest > 0 then {
            put(Ls, srest)
            }
         }
   end
initially
   dupsfound := 0
end

global wholefile, fout, fdup, docnos

procedure main(av)
   if *av = 0 then stop("usage: t2 [-n=nthreads] datfile [outfile]")
   if match("-n=", av[1]) then {
      nthreads := integer(av[1][4:0])
      pop(av)
      }
   else nthreads := DEFAULT_NTHREADS
   write("super-toxline-dedup startup, using ", nthreads, " threads")

   docnos := set()
   sz := stat(av[1]).size
   fin := open(av[1], "r") | stop("Cannot open " || av[1])
   write("size is ", sz)

   t1 := &time
   count := 0
   wholefile := reads(fin, sz)
   t2 := &time
   write("read ", *wholefile, " bytes in ", t2 - t1, "ms")
   close(fin)

   fout := open(av[2] | (av[1] || ".out"), "w") |
      stop("Cannot open output file")
   fdup := open("toxline.dups", "w") |
      stop("Cannot open for write: toxline.dups.")

   # Split the file into nthreads chunks, each ending just past a </DOC>,
   # so that no record straddles two chunks.
   Ltd := list(nthreads)
   chunksize := *wholefile / nthreads
   startindex := 1
   every i := 1 to nthreads-1 do {
      endindex := startindex + chunksize
      endindex := find("</DOC>", wholefile, endindex) + 6
      Ltd[i] := threaddata(i, wholefile[startindex : endindex])
      startindex := endindex
      }
   Ltd[nthreads] := threaddata(nthreads, wholefile[startindex : 0])

   # One thread per chunk; each builds its own line list and record list.
   Lthr := list(nthreads)
   every i := 1 to nthreads do
      Lthr[i] := thread Ltd[i].computeLs()
   every wait(!Lthr)

   sumlines := 0
   every sumlines +:= *((!Ltd).Ls)
   nrecs := 0
   every nrecs +:= *((!Ltd).Lqueue)
   t3 := &time
   write("computed ", sumlines, " lines, ", nrecs, " records in ",
         t3 - t2, "ms")

   # 282,000 times...
   # Merge the per-thread record lists sequentially, keeping the first
   # occurrence of each docno and logging the rest to toxline.dups.
   # Output is buffered and flushed every 1000 records.
   n := 0
   out_to_fout := ""
   out_to_fdup := ""
   every rec := !((!Ltd).Lqueue) do {
      n +:= 1
      if n % 1000 = 0 then {
         write("dup/out ", n)
         writes(fdup, out_to_fdup)
         writes(fout, out_to_fout)
         out_to_fout := ""
         out_to_fdup := ""
         }
      if member(docnos, rec.docno) then {
         out_to_fdup ||:= rec.docno
         out_to_fdup ||:= "\n"
         }
      else {
         insert(docnos, rec.docno)
         out_to_fout ||:= rec.text
         out_to_fout ||:= "\n"
         }
      }
   writes(fdup, out_to_fdup)
   writes(fout, out_to_fout)
end
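
#
# The header above notes the need for a multi-platform way to detect the
# number of cores. The procedure below is only a sketch of a Linux-only
# stopgap (it counts "processor" entries in /proc/cpuinfo); the name
# guess_ncores() is mine, and nothing in main() calls it.
#
procedure guess_ncores()
   local f, line, n
   n := 0
   f := open("/proc/cpuinfo", "r") | fail
   every line := !f do
      if match("processor", line) then n +:= 1
   close(f)
   return 0 < n        # fail if nothing was counted
end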
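
#
# Example invocation (the file names and thread count here are only
# illustrative):
#
#   ./t2 -n=8 toxline.dat toxline.clean
#
# Unique records go to toxline.clean (or to toxline.dat.out if no outfile
# is given); the docnos of duplicate records are written to toxline.dups.
#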