/* File is constructed in perfrun.hoc by printing the tperf for rank 0
using p_perf where the vector tperf is retrieved after a psolve via
	tdat_.x[5] = pnm.pc.send_time(4, chist, tperf)
The first item in the file after the header is the size of tperf
See bgpdma.cpp nrn_bgp_receive_time for the meaning of the first arg. In
particular, 4 returns the number of extra conservation checks, fills chist
with the histogram of number of intervals using 0-11 conservation checks,
and fills tperf with itbuf items with the itbuf+1 item being the DCMF_Tick().
The items are in sets of DCMF_Timebase() as indicated below.
It seems itbuf is initialized only at launch and not in bgp_dma_init (which
would allow multiple runs per launch).
All the TBUF fills take place in bgp_dma_receive
Given the several multisend methods we are using and the variation in
network parameters, it is a good idea to prepend run information to the file
*/
// allgather 201 (usually) sets of
// 0 enter nrn_spike_exchange_compressed
// 1 after barrier
// 2 after nrnmpi_spike_exchange_compressed (allgather, allgatherv)
// 3 #sent (cells that spiked)
// 4 #received
// 5 exit nrn_spike_exchange_compressed (after find and enqueue)

// multisend 401 (usually)  sets of
// 0 enter bgp_dma_receive
// 1 after DCMF_Messager_advance (or bgp_advance)
// 2 after barrier
// 3 after conserve
// 4 ncons
// 5 #cells_that_fired
// 6 #sent (cell spiked * fanout)
// 7 #received
// 8 # cumulative spike send time (BGP_DMASend::send
// if ENQUEUE == 1 then 9 incoming spikes are saved in buffer, TBUF, then to queue
// if ENQUEUE == 2 then 9 and 10 are find and enqueue times for the interval
//    but do not include time during  barrier and conserve
// 9 + ENQUEUE exit bgp_dma_receive

{load_file("nrngui.hoc")}
objref td[1], vmore[1]
strdef fname
proc rd() {local i, j, k, n, size, toff, tm, methodinfo \
  localobj f, indices, s
	f = new File()
	f.chooser("r", "Read", "*.*")
	if (numarg() == 1) {
		fname = $s1
	}else{
		if (!f.chooser()){
			quit()
		}
		fname = f.getname()
	}
	f.ropen(fname)
	// read header
	s = new String()
	f.gets(s.s) printf("%s", s.s)
	nver=0
	sscanf(s.s, "%*[^(](%d)", &nver)	
	f.gets(s.s) printf("%s", s.s)
	if (nver >= 534) { f.gets(s.s) printf("%s", s.s) }
	methodinfo = f.scanvar()
	method = methodinfo%4
	enqueue=int(methodinfo/32)
	phase2=int(methodinfo/8)%2
	if (method == 0) { enqueue = 0 }
	n_bgp_interval = (int(methodinfo/4)%2) + 1
printf("method=%d n_bgp_interval=%g enqueue=%g phase2=%g\n", method, n_bgp_interval,enqueue,phase2)
	nhost = f.scanvar()
	printf("nhost=%d\n", nhost)
		f.gets(s.s) printf("%s", s.s)
		f.gets(s.s) printf("%s", s.s)
		f.gets(s.s) printf("%s", s.s)
	while(1) {
		f.gets(s.s) printf("%s", s.s)
		if (strcmp(s.s,"@@@end header@@@\n") == 0) { break }
		sscanf(s.s, "%[^\n]", s.s)
		execute(s.s)
	}

	indices = new Vector()
	if (nver < 519) {
		nd = 8 + (enqueue == 1)
		indices.append(0,1,2,3)
	}else if (nver < 523){
		nd = 9 + (enqueue == 1)
		indices.append(0,1,2,3)
	}else{
		if (method == 0) {
			nd = 6
			indices.append(0,1,2,5)
		}else{
			if (nver < 544) {
				nd = 10 + enqueue
			}else{
				nd = 10 + enqueue + 2*phase2
			}
			indices.append(0,1,2,3,nd-1)
			if (enqueue == 1) {
				indices.append(nd-2)
			}
		}
	}
	objref td[nd]

	// read data
	size = (f.scanvar()-1)/nd
	print size
	for i=0, nd-1 {td[i] = new Vector(size)}
	for i = 0, size-1 {
		for j=0,nd-1 {
			td[j].x[i] = f.scanvar()
		}
	}
	ts = f.scanvar()

	nmore = f.scanvar()
	if (nmore > 0) objref vmore[nmore]
	for i=0, nmore-1 {
		n = f.scanvar()
		vmore[i] = new Vector(n)
		vmore[i].scanf(f, n)
	}

	tm = 2^32
	toff = td[0].x[0]
	for k=0, indices.size-1 {
		j = indices.x[k]
		// all vectors start at 0
		td[j].sub(toff)
		// take wrap around into account so vectors increase monotonically
		for i=1, size-1 {
			while (td[j].x[i] < td[j].x[i-1]) {
				td[j].x[i] += tm
			}
		}
		//td[j].mul(ts)
	}
	print ts
	f.close	
}
//rd(1)

obfunc combine() { local i localobj x
	x = new Vector()
printf("combine size=%d\n", $o1.size)
	for (i=0; i < $o1.size; i += 2) {
		x.append($o1.x[i] + $o1.x[i+1])
	}
	return x
}

objref g, g1, g2, yy[7], name[7]
proc anal() {local i, j, a, b, n, c localobj tt, xx, s1
	// for multisend
	// yy[0] is from after conserve (3) to before barrier (1). i.e
	// overlap between computation and spike exchange and enqueing
	// yy[1] is from after conserve to next after conserve, i.e.
	// the total interval time.
	// for allgather, yy[0] is from after exchange(2) to enter exchange(0)
	// yy[1] is from enter to enter
	for i=0, 1 yy[i] = new Vector()
	if (method == 0) {
		for i=0, td[2].size-2 yy[0].append(td[0].x[i+1] - td[2].x[i])
		for i=0, td[0].size-2 yy[1].append(td[0].x[i+1] - td[0].x[i])
	}else{
		for i=0, td[3].size-2 yy[0].append(td[1].x[i+1] - td[3].x[i])
		for i=0, td[1].size-2 yy[1].append(td[3].x[i+1] - td[3].x[i])
	}
	for i=0, 1 yy[i].div(1e6)
	name[0] = new String("interval compute")
	name[1] = new String("total interval")
}

proc gr() {local i, j, a, b, n, c localobj tt, xx, s1
	g1 = new Graph(0)
	g1.view(0,0,200,12, 100, 100, 500, 250)
	//g2 = new Graph()
	g1.size(0,200,0,15)
	//g2.size(0,40,0,6e6)
	g1.label(.3, .9, fname, 2)
	//g2.label(.5, .9, fname, 2)
	g = g1
	anal()
	n = yy[0].size
	if (n > 300) {
		g1.size(0,400,0,6)
	}else{
		g1.size(0,200,0,12)
	}

	for case (&i, 0, 1) {
		labl(yy[i], i)
		c = i+1
		if (nmore > 0) {
			if (i == 0) j = 1 else j = 0
			vmore[j*2].c.div(1e6).div(nhost).line(g, c, 3)
			vmore[j*2+1].c.div(1e6).line(g, c, 6)
		}
		yy[i].line(g, c, 1)
	}
}

proc labl() {local x  localobj s
	s = new String()
	x = $o1.sum*ts*1e6
	printf("%3.2fs %s\n", x, name[$2].s)
	sprint(s.s, "%3.2fs %s", x, name[$2].s)
	g1.label($o1.size*1.02, $o1.c($o1.size-101, $o1.size-1).mean,s.s,1,1,0,.5,1)
}
//gr()

objref list
list = new List()
proc again() {
	list.append(g1) list.append(g2)
	rd()
	gr()
}

proc ag() {
	list.append(g1) list.append(g2)
	rd($s1)
	gr()
}

objref flist
flist = new List()

proc fchoose() {local i, n  localobj s, po
	s = new String()
	system("cd dat ; ls t*", s.s)
	po = new PythonObject()
	po.s = s.s
	nrnpython("fnames = s.splitlines()")
	n = po.len(po.fnames)
	for i=0, n-1 {
		flist.append(new String(po.fnames._[i]))
	}
	flist.browser("Choose File", "s")
	flist.select_action("select(hoc_ac_)")
}
proc select() {localobj s
	if ($1 < 0) { return }
	s = new String()
	sprint(s.s, "dat/%s", flist.o($1).s)
	ag(s.s)
}
fchoose()

/*
ag("nrn-results/1kconn/results_256_8/t0perf18.1k.8K.dat")
ag("nrn-results/1kconn/results_256_8/t2perf18.1k.8K.dat")
ag("nrn-results/1kconn/results_256_8/t6perf18.1k.8K.dat")

ag("nrn-results/1kconn/results_256_16/t0perf18.0.16384.dat")
ag("nrn-results/1kconn/results_256_16/t2perf18.0.16384.dat")
ag("nrn-results/1kconn/results_256_16/t6perf18.0.16384.dat")

ag("nrn-results/1kconn/results_256_32/t0perf18.0.32768.dat")
ag("nrn-results/1kconn/results_256_32/t2perf18.0.32768.dat")
ag("nrn-results/1kconn/results_256_32/t6perf18.0.32768.dat")
*/

proc perf_t2_1k() {
	ag("dat/t2perf22.1k.16K.86.550.0.0")
	ag("dat/t2perf22.1k.16K.94.550.0.0")
	ag("dat/t2perf22.1k.16K.82.551+.0.0")
	ag("dat/t2perf22.1k.16K.90.552.0.0")
}

proc perf_t2_10k() {
	ag("dat/t2perf21.10k.64K.86.549+.7.0")
	ag("dat/t2perf21.10k.64K.94.549+.10.0")
	ag("dat/t2perf21.10k.64K.82.552.0.0")
	ag("dat/t2perf21.10k.64K.90.549+.31.0")
}

proc perf_t3() {
	ag("dat/t3perf24.1k.64K.83.549+.24.0")
	ag("dat/t3perf24.1k.64K.87.542.0")
	ag("dat/t3perf21.10k.64K.83.549+.32.0")
	ag("dat/t3perf21.10k.64K.87.549+.6.0")
}

proc perf_t2_greedy_enqueue() {
ag("debug/t1perf17.1k.512.93.542.1.0")
}

proc table() {local i  localobj fnl, sl, r, m, f
	sl = new StringFunctions()
	r = new String()
	fnl = new List()
	for i=0, flist.count-1 {
		if (sl.head(flist.o(i).s, "54[789].*\\.0$", r.s) != -1) {
			fnl.append(flist.o(i))
		}else if (sl.head(flist.o(i).s, "551.*\\.0$", r.s) != -1) {
			fnl.append(flist.o(i))
		}
	}
	//ncell ncon nhost meth invl phase comp total
	m = new Matrix(fnl.count, 9)
	chdir("dat")
	for i=0, fnl.count-1 {
		print fnl.o(i).s
		rd(fnl.o(i).s)
		anal()
		m.x[i][0] = 2^ncellpow
		m.x[i][1] = ncon
		m.x[i][2] = nhost
		m.x[i][3] = method
		m.x[i][4] = n_bgp_interval
		m.x[i][5] = phase2
		m.x[i][6] = yy[0].sum*ts*1e6
		m.x[i][7] = yy[1].sum*ts*1e6
		m.x[i][8] = set_maxstep_time
	}
	chdir("..")
m.printf("  %4.2f")
	f = new File()
	f.wopen("table.dat")
	m.fprint(f, " %.20g")
	f.close()
}
//table()