Description
LU measures the floating point rate of execution for solving a linear system of equations.
The parameters of the implemented programs are:
- M the size of the matrix, that is, the matrix is M by M.
- B the block size,
- px*py is the processor grid
Performance
P = N * T | M | B | px | py | GFlops | GFlops |
2009 | 2008 | |||||
16 = 1 * 16 | 45600 | 200 | 4 | 4 | 92.07 | 67.45 |
32 = 2 * 16 | 64000 | 200 | 4 | 8 | 173.11 | 123.08 |
64 = 4 * 16 | 91200 | 200 | 8 | 8 | 354.32 | 263.56 |
128 = 8 * 16 | 128000 | 200 | 8 | 16 | 666.03 | 529.79 |
256 = 16 * 16 | 184320 | 256 | 16 | 16 | 1268.00 | 1004.85 |
- P = number of processes
- N = number of nodes
- T = number or processes per node
Results obtained on 10-25-2009 using:
- X10 11743
- LU 11751
- SMT on, binding on
LU Source Files
LU Log Files
LU Code
LU.x10
package rc7; import x10.compiler.Native; import x10.runtime.PlaceLocalHandle; import x10.runtime.PlaceLocalStorage; class LU { @Native("c++", "blockTriSolve((#1)->raw(), (#2)->raw(), #3)") native static def blockTriSolve(me:Rail[Double], diag:Rail[Double], B:Int):Void; @Native("c++", "blockBackSolve((#1)->raw(), (#2)->raw(), #3)") native static def blockBackSolve(me:Rail[Double], diag:Rail[Double], B:Int):Void; @Native("c++", "blockMulSub((#1)->raw(), (#2)->raw(), (#3)->raw(), #4)") native static def blockMulSub(me:Rail[Double], left:Rail[Double], upper:Rail[Double], B:Int):Void; @Native("c++", "blockMulSubRow((#1)->raw(), (#2)->raw(), #3, #4, #5)") native static def blockMulSubRow(me:Rail[Double], diag:Rail[Double], B:Int, j:Int, cond:boolean):Void; static def runAt(id:Int, c:()=>Void) { x10.runtime.NativeRuntime.runAt(id, c); x10.runtime.NativeRuntime.dealloc(c); } const unique = Dist.makeUnique(); val M:Int; val N:Int; val B:Int; val MB:Int; val NB:Int; val px:Int; val py:Int; val A:PlaceLocalHandle[BlockedArray]; val A_here:BlockedArray!; val world:Comm!; val col:Comm!; val row:Comm!; var ready:Boolean; val pivot:Rail[Int]!; val rowForBroadcast:Rail[Double]!; val rowBuffer:Rail[Double]!; val colBuffer:Rail[Double]!; val colBuffers:ValRail[Rail[Double]!]; val rowBuffers:ValRail[Rail[Double]!]; val buffer:Rail[Double]!; val buffers:PlaceLocalHandle[Rail[Double]]; def computeRowSum() { val sum = Rail.makeVar[Double](B); for (var I:Int = 0; I <= MB; ++I) if (A_here.hasRow(I)) { val IB = I * B; for (var k:Int = 0; k < B; ++k) sum(k) = 0.0; for (var J:Int = 0; J <= MB; ++J) if (A_here.hasCol(J)) { val JB = J * B; val b = A_here.block(I, J); for (var i:Int = 0; i < B; ++i) { for (var j:Int = 0; j < B; ++j) { sum(i) += b(IB + i, JB + j); } } } for (var k:Int = 0; k < B; ++k) { sum(k) = row.sum(sum(k)); if (A_here.hasCol(NB)) A_here(IB + k, M) = sum(k); } } } def computeLocalMax(J:Int, LUCol:Int) { var max:Double = 0; var id:Int = -1; for (var I:Int = J; I <= MB; ++I) if (A_here.hasRow(I)) { val IB = I * B; val b = A_here.block(I, LUCol / B); for (var i:Int = Math.max(IB, LUCol); i < IB + B; i++) { if (Math.abs(b(i, LUCol)) >= Math.abs(max) || id == -1) { max = b(i, LUCol); id = i; } } } return col.indexOfAbsMax(max, id); } def exchange(row:Int, row2:Int, min:Int, max:Int, dest:Int) { val source = here; ready = false; val size = A_here.getRow(row, min, max, buffer); val _buffers = buffers; val _A = A; buffers.copyTo(Place.places(dest), size, ()=>{ val size = _A.get().swapRow(row2, min, max, _buffers.get()); _buffers.copyTo(source, size, ()=>{ A_here.setRow(row, min, max, buffer); atomic ready=true; }); }); await ready; } def panel(J:Int, timer:Timer!) { val A_panel_j = A_here.blocks(J, MB, J, J); val A_ext_panel_j = A_here.blocks(0, MB, J, J); if (!A_ext_panel_j.empty()) { var n:Int = 0; val LUColStart:Int = J * B; for (var LUCol:Int = LUColStart; LUCol<LUColStart + B; LUCol++) { timer.start(5); val row2:Int = computeLocalMax(J, LUCol); timer.stop(5); timer.start(6); if (A_here.hasBlock(J, J)) { val row = LUCol; pivot(n) = row2; if (row2 != row) { val dest = A_here.placeOf(row2, LUCol); if (dest == here.id) { val b0 = A_ext_panel_j.blockOf(row, LUCol); val b1 = A_ext_panel_j.blockOf(row2, LUCol); for (var j:Int = J*B; j < J*B + B; ++j) { var tmp:Double = b0(row, j); b0(row, j) = b1(row2, j); b1(row2, j) = tmp; } } else { exchange(row, row2, LUColStart, LUColStart + B - 1, dest); } } val block = A_here.block(J, J); for (var i:Int = 0; i < B; ++i) rowForBroadcast(i) = block(LUCol, J*B+i); } timer.stop(6); timer.start(7); col.broadcast_d(rowForBroadcast, J%px); timer.stop(7); if(!A_panel_j.empty()) { timer.start(8); for (var I:Int = A_panel_j.min_x; I <= A_panel_j.max_x; I += px) { blockMulSubRow(A_here.block(I, J).raw, rowForBroadcast, B, LUCol - J * B, I == J); } timer.stop(8); } n++; } } } def swapRows(J:Int) { row.broadcast(pivot, J%py); val row_panel = A_here.blocks(J, J, J + 1, NB); if (!row_panel.empty()) { var n:Int = 0; for (var row:Int = J * B; row < (J + 1) * B; ++row) { val row2 = pivot(n++); if (row2 == row) continue; val dest = A_here.placeOf(row2, row_panel.min_y * B); if (dest == here.id) { for (var j:Int = (J + 1) * B; j < N; j += B) { if (A_here.placeOf(row, j) == here.id) { val b1 = A_here.blockOf(row, j); val b2 = A_here.blockOf(row2, j); for (var k:Int = j; k < j + B; ++k) { var tmp:Double = b1(row, k); b1(row, k) = b2(row2, k); b2(row2, k) = tmp; } } } } else { exchange(row, row2, (J + 1) * B, N - 1, dest); } } } } def triSolve(J:Int) { if (A_here.hasRow(J)) { val diag = A_here.hasCol(J) ? A_here.block(J, J).raw : colBuffer; row.broadcast_d(diag, J%py); for (var cj:Int = J + 1; cj <= NB; ++cj) if (A_here.hasCol(cj)) { blockTriSolve(A_here.block(J, cj).raw, diag, B); } } } def update(J:Int) { val A_U = A_here.blocks(0, MB, J + 1, NB); if (!A_U.empty()) { for (var cj:Int = A_U.min_y; cj <= A_U.max_y; cj += py) { val block = A_here.hasBlock(J, cj) ? A_U.block(J, cj).raw : colBuffers(cj/py); col.broadcast_d(block, J%px); } } world.barrier(); val A_L = A_here.blocks(J + 1, MB, 0, NB); if (!A_L.empty()) { for (var ci:Int = A_L.min_x; ci <= A_L.max_x; ci += px) { val block = A_here.hasBlock(ci, J) ? A_L.block(ci, J).raw : rowBuffers(ci/px); row.broadcast_d(block, J%py); } } world.barrier(); val A_trail = A_here.blocks(J + 1, MB, J + 1, NB); if (!A_trail.empty()) { for (var ci:Int = A_trail.min_x; ci <= A_trail.max_x ; ci += px){ for (var cj:Int = A_trail.min_y; cj <= A_trail.max_y; cj += py) { val left = A_here.hasCol(J) ? A_L.block(ci, J).raw : rowBuffers(ci/px); val upper = A_here.hasRow(J) ? A_U.block(J, cj).raw : colBuffers(cj/py); blockMulSub(A_trail.block(ci, cj).raw, left, upper, B); } } } } def solve(timer:Timer!) { progressInc:Int = 2; var nextJ:Int = progressInc; computeRowSum(); world.barrier(); timer.start(9); for (var J:Int = 0; J < NB; J++){ timer.start(1); panel(J, timer); world.barrier(); timer.stop(1); timer.start(2); swapRows(J); world.barrier(); timer.stop(2); timer.start(3); triSolve(J); world.barrier(); timer.stop(3); timer.start(4); if (J != NB - 1) update(J); world.barrier(); timer.stop(4); /* Progress meter */ if(0 == here.id && J > nextJ) { timer.stop(9); Console.OUT.println(J + " of " + NB + " complete " + "last " + progressInc + " iterations took " + (timer.total(9) as Double)/1e9 + " seconds"); nextJ += progressInc; timer.clear(9); timer.start(9); } } } def memget(I:Int, J:Int) { if (A_here.hasBlock(I, J)) { return A_here.block(I, J).raw; } else { val source = here; ready = false; val _A = A; val _rowBuffer = rowBuffer; val _B = B; runAt(A_here.placeOfBlock(I, J), ()=>{ _A.get().block(I, J).raw.copyTo(0, _rowBuffer, 0, _B * _B, ()=>{ atomic ready=true; }); }); await ready; return rowBuffer; } } def backsolve() { val A_last_panel = A_here.blocks(0, MB, NB, NB); if (!A_last_panel.empty()) { for (var I:Int = MB; I >= 0; --I) { if (A_here.hasRow(I)) { blockBackSolve(A_here.block(I, NB).raw, memget(I, I), B); } val bufferY = A_here.hasRow(I) ? A_here.block(I, NB).raw : colBuffer; col.broadcast_d(bufferY, I%px); for (var ci:Int = 0; ci < I; ++ci) if (A_here.hasRow(ci)) { blockMulSub(A_here.block(ci, NB).raw, memget(ci, I), bufferY, B); } col.barrier(); } } world.barrier(); } def check() { var max:Double = 0.0; for (var i:Int = 0; i < M; ++i) { if (A_here.placeOf(i, M) == here.id) { val v = 1.0 - A_here(i, M); max = Math.max(max, v * v); } } Console.OUT.println("diff " + max + " " + here.id); return col.max(max); } public static def main(args:Rail[String]!) { if (args.length < 4) { Console.OUT.println("Usage: LU M B (px py)"); Console.OUT.println("M = Matrix size,"); Console.OUT.println("B = Block size, where B should perfectly divide M"); Console.OUT.println("px py = Processor grid, where px*py = nplaces"); return; } val M = Int.parseInt(args(0)); val B = Int.parseInt(args(1)); val N = M + B; val px = Int.parseInt(args(2)); val py = Int.parseInt(args(3)); val A = BlockedArray.make(M, N, B, B, px, py); val buffers = PlaceLocalStorage.createDistributedObject[Rail[Double]](unique, ()=>Rail.makeVar[Double](N)); val lus = PlaceLocalStorage.createDistributedObject[LU](unique, ()=>new LU(M, N, B, px, py, A, buffers)); Console.OUT.println ("LU Starting: M " + M + " B " + B + " px " + px + " py " + py); start(lus); } def this(M:Int, N:Int, B:Int, px:Int, py:Int, A:PlaceLocalHandle[BlockedArray], buffers:PlaceLocalHandle[Rail[Double]]) { this.M = M; this.N = N; this.B = B; this.px = px; this.py = py; this.A = A; A_here = A.get(); this.buffers = buffers; buffer = buffers.get(); MB = M / B - 1; NB = N / B - 1; val world = Comm.WORLD(); this.world = world; val pi = here.id / py; val pj = here.id % py; col = world.split(pj, pi); row = world.split(pi, pj); pivot = Rail.makeVar[Int](B); rowForBroadcast = Rail.makeVar[Double](B); val rowBuffers = Rail.makeVal[Rail[Double]](M / B / px + 1, (Int)=>Rail.makeVar[Double](B * B)); val colBuffers = Rail.makeVal[Rail[Double]](N / B / py + 1, (Int)=>Rail.makeVar[Double](B * B)); this.rowBuffers = rowBuffers; this.colBuffers = colBuffers; rowBuffer = rowBuffers(0); colBuffer = colBuffers(0); } static def start(lus:PlaceLocalHandle[LU]) { Console.OUT.println("Main loop starting ..."); var t:Long = -System.nanoTime(); finish ateach (p in unique) { val lu = lus.get(); val timer = new Timer(10); timer.start(0); lu.solve(timer); lu.backsolve(); val r = lu.check(); timer.stop(0); if (here.id == 0) { Console.OUT.println ("difference " + r); Console.OUT.println(((r < 0.01?" ok)":" fail ") + " diff=" + r)); Console.OUT.println ("Timer(0) TOTAL #invocations=" + timer.count(0) + " Time=" + (timer.total(0) as Double) / 1e9 + " seconds"); Console.OUT.println ("Timer(1) PANEL #invocations=" + timer.count(1) + " Time=" + (timer.total(1) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(2) SWAPROWS #invocations=" + timer.count(2) + " Time=" + (timer.total(2) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(3) TRISOLVE #invocations=" + timer.count(3) + " Time=" + (timer.total(3) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(4) UPDATE #invocations=" + timer.count(4) + " Time=" + (timer.total(4) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(5) PANEL-PIVOT #invocations=" + timer.count(5) + " Time=" + (timer.total(5) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(6) PANEL-SWAP #invocations=" + timer.count(6) + " Time=" + (timer.total(6) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(7) PANEL-BCAST #invocations=" + timer.count(7) + " Time=" + (timer.total(7) as Double)/1e9 + " seconds"); Console.OUT.println ("Timer(8) PANEL-UPDATE #invocations=" + timer.count(8) + " Time=" + (timer.total(8) as Double)/1e9 + " seconds"); } } t += System.nanoTime(); Console.OUT.println(); Console.OUT.println(" Time= "+ t/1e9 + " seconds" + " Rate= " + flops(lus.get().N)/t + " GFlops"); } static def flops(n:Int) = ((4.0*n-3.0)*n-1.0)*n/6.0; }