[RFA] better igc up-to-date-ness tracking, optimized igc communication



As a proof of concept, the attached patch implements more accurate tracking
of which igc cells are up to date, in order to reduce communication.  It also
implements new GuardSend/GuardReceive iterates for the POOMA_MPI case which
are able to merge consecutive igc updates by using a global std::map for
bookkeeping (yay!).
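
For the gist of the merging without reading the whole patch, here is a
minimal sketch of the bookkeeping idea.  PatchData, GuardIterate and
requestGuardUpdate() are placeholders for illustration only; the actual key
type in the patch is std::pair<int, Pooma::DataObject_t*>.

  // Sketch only: merge consecutive guard updates that target the same
  // (context, patch) pair into one pending iterate via a global map.
  #include <map>
  #include <utility>

  struct PatchData;                         // stands in for Pooma::DataObject_t
  struct GuardIterate {                     // stands in for Guard{Send,Receive}Iterate
    explicit GuardIterate(int c) : context(c) {}
    void addDomain(/* const Domain& */) {}  // widen the pending transfer
    int context;
  };

  typedef std::pair<int, PatchData*> Key;   // (context, patch)
  static std::map<Key, GuardIterate*> pending;

  void requestGuardUpdate(PatchData* patch, int context /*, const Domain& */)
  {
    GuardIterate*& it = pending[Key(context, patch)];
    if (!it) {
      it = new GuardIterate(context);       // first update for this pair:
      /* Pooma::scheduler().handOff(it); */ // create and hand off a new iterate
    } else {
      it->addDomain();                      // further updates: merge into it
    }
    // The iterate erases its own entry when it runs, so merging is only
    // possible while the message has not been issued yet.
  }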

A new test, Array/tests/array_test30.cpp, tries to cover all possible cases
of igc updates.  It passes in serial mode and in MPI mode with 1 process and
with #processes == #patches.  For mixed remote/local updates there seems to
be a problem, which is why this is an RFA and not a patch submission.

Please look at it, tell me how to avoid the global std::map (or whether it
is ok as is), test it with your favorite application, etc.  I'll be away on
holiday for a few weeks now.

Richard.


2004Jul23  Richard Guenther <richard.guenther@xxxxxxxxxxxxxxxx>

	* src/Engine/MultiPatchEngine.h: abstract the dirty flag
	and record partial up-to-date-ness via a GuardLayers object.
	(setDirty): for POOMA_MPI mode clear the global hash.
	* src/Engine/MultiPatchEngine.cpp: handle the abstract dirty
	flag.
	(simpleAssign): add optimized mode using the new GuardSend
	and GuardReceive iterates for POOMA_MPI.
	(fillGuardsHandler): honour partial up-to-date information,
	produce partial update domains.
	* src/Layout/GuardLayers.h: abstract the element data type
	via Element_t, add operator>=.
	* src/Threads/IterateSchedulers/SerialAsync.h
	(waitForSomeRequests): fix bug for non-blocking operation.
	(runSomething): always complete finished message iterates first.
	* src/Tulip/Messaging.cmpl.cpp: define the global GuardSend
	and GuardReceive iterate maps.
	* src/Tulip/SendReceive.h: add GuardSend and GuardReceive
	iterates that are able to merge with consecutive iterates
	using a global std::map.
	* src/Array/tests/array_test30.cpp: new.
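
For the fillGuardsHandler change, here is a rough 1D sketch of the geometry
behind the partial update domains (the "domain2" used for local copies in the
patch): the fill for a guard face is restricted to the layers that are both
needed by the expression and not yet up to date.  Range and updateDomain()
are illustrative stand-ins, not the patch's actual Interval/GuardLayers code.

  // Sketch only (1D): 'needed' is the guard width the expression requires,
  // 'clean' is the width an earlier, narrower update already filled.
  struct Range { int lo, hi; };   // inclusive; empty when lo > hi

  Range updateDomain(Range guard, int needed, int clean, bool upperFace)
  {
    Range d = guard;
    if (upperFace) {
      // upper face: the layers closest to the patch sit at the low end
      d.hi = d.lo + needed - 1;   // keep only the layers the stencil needs
      d.lo += clean;              // drop layers that are already up to date
    } else {
      // lower face: the layers closest to the patch sit at the high end
      d.lo = d.hi - needed + 1;
      d.hi -= clean;
    }
    return d;   // the caller skips the face entirely when clean >= needed
  }
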
===== src/Engine/MultiPatchEngine.h 1.12 vs edited =====
--- 1.12/r2/src/Engine/MultiPatchEngine.h	2004-01-29 12:01:35 +01:00
+++ edited/src/Engine/MultiPatchEngine.h	2004-07-23 13:34:34 +02:00
@@ -659,27 +659,36 @@
 
   inline void setDirty() const
   {
-    *pDirty_m = (1<<(Dim*2))-1;
+    pDirty_m->setDirty();
+
+#if POOMA_MPI
+    // We need to clear the GuardSend and GuardReceive iterate map for
+    // all DataObjects in this engine.
+    // Possibly instead of the O(n^2) looking loop we could loop over
+    // the guard fill list and identify all possible communication
+    // iterates that way.  For later optimization.
+    typedef std::pair<int, Pooma::DataObject_t*> GIMKey_t;
+    typedef std::map<GIMKey_t, Pooma::Iterate_t*> GIMap_t;
+    extern GIMap_t guardSendIterateMap_g;
+    extern GIMap_t guardReceiveIterateMap_g;
+    for (int i=0; i<layout().sizeLocal(); ++i) {
+      for (int j=0; j<Pooma::contexts(); ++j) {
+	GIMKey_t key(j, localPatch(i).localEngine().dataObject());
+	guardSendIterateMap_g.erase(key);
+	guardReceiveIterateMap_g.erase(key);
+      }
+    }
+#endif
   }
 
-  inline void clearDirty(int face = -1) const
+  inline void clearDirty() const
   {
-    if (face == -1)
-      *pDirty_m = 0;
-    else {
-      PAssert(face >= 0 && face <= Dim*2-1);
-      *pDirty_m &= ~(1<<face);
-    }
+    pDirty_m->clearDirty();
   }
 
-  inline bool isDirty(int face = -1) const
+  inline bool isDirty() const
   {
-    if (face == -1)
-      return *pDirty_m != 0;
-    else {
-      PAssert(face >= 0 && face <= Dim*2-1);
-      return *pDirty_m & (1<<face);
-    }
+    return pDirty_m->isDirty();
   }
 
   //============================================================
@@ -879,6 +888,36 @@
     
   };
 
+  /// Opaque type for the dirty flag
+
+  struct DirtyFlag
+  {
+    void setDirty()
+    {
+      dirty_m = true;
+      clean_m = GuardLayers<Dim>(0);
+    }
+    void clearDirty()
+    {
+      dirty_m = false;
+      clean_m = GuardLayers<Dim>(255);
+    }
+    bool isDirty()
+    {
+      return dirty_m;
+    }
+    int upToDate(int face) const
+    {
+      return face&1 ? clean_m.upper(face/2) : clean_m.lower(face/2);
+    }
+    typename GuardLayers<Dim>::Element_t& upToDate(int face)
+    {
+      return face&1 ? clean_m.upper(face/2) : clean_m.lower(face/2);
+    }
+    GuardLayers<Dim> clean_m;
+    bool dirty_m;
+  };
+
   //===========================================================================
   // Data
   //===========================================================================
@@ -896,7 +935,7 @@
   /// must share the same flag. We use the reference count in
   /// data_m to decide whether to clean this up.
 
-  int *pDirty_m;
+  DirtyFlag *pDirty_m;
 };
 
 
@@ -1245,14 +1284,14 @@
     baseEngine_m.setDirty();
   }
 
-  inline void clearDirty(int face=-1) const
+  inline void clearDirty() const
   {
-    baseEngine_m.clearDirty(face);
+    baseEngine_m.clearDirty();
   }
   
-  inline bool isDirty(int face=-1) const
+  inline bool isDirty() const
   {
-    return baseEngine_m.isDirty(face);
+    return baseEngine_m.isDirty();
   }
 
   //---------------------------------------------------------------------------
===== src/Engine/MultiPatchEngine.cpp 1.12 vs edited =====
--- 1.12/r2/src/Engine/MultiPatchEngine.cpp	2004-01-29 12:06:22 +01:00
+++ edited/src/Engine/MultiPatchEngine.cpp	2004-07-23 13:17:28 +02:00
@@ -79,12 +79,10 @@
 Engine(const Layout_t &layout)
   : layout_m(layout),
     data_m(layout.sizeGlobal()),
-    pDirty_m(new int)
+    pDirty_m(new DirtyFlag)
 {
   typedef typename Layout_t::Value_t Node_t;
 
-  setDirty();
-
   // check for correct match of PatchTag and the mapper used to make the
   // layout.
   // THIS IS A HACK! we test on the context of the first patch, and if it
@@ -135,6 +133,8 @@
   // Attach ourself to the layout so we can receive messages.
   
   layout_m.attach(*this);
+
+  setDirty();
 }
 
 
@@ -250,7 +250,7 @@
 {
   if (data_m.isValid() && data_m.isShared()) {
     data_m.makeOwnCopy();
-    pDirty_m = new int(*pDirty_m);
+    pDirty_m = new DirtyFlag(*pDirty_m);
   }
 
   return *this;
@@ -264,88 +264,134 @@
 //
 //-----------------------------------------------------------------------------
 
+#if POOMA_MPI
+
 /// Guard layer assign between non-remote engines, just use the
-/// ET mechanisms
+/// ET mechanisms on the optimized domain
 
 template <int Dim, class T, class Tag>
 static inline
 void simpleAssign(const Array<Dim, T, Tag>& lhs,
 		  const Array<Dim, T, Tag>& rhs,
-		  const Interval<Dim>& domain)
+		  const Interval<Dim>&,
+		  const Interval<Dim>& domain2,
+		  bool optimize)
 {
-  lhs(domain) = rhs(domain);
+  lhs(domain2) = rhs(domain2);
 }
 
 /// Guard layer assign between remote engines, use Send/Receive directly
 /// to avoid one extra copy of the data.
+/// Uses domain2 for local->local copies and domain for copies involving
+/// remote engines, as Send/Receive requests get merged along the way.
 
 template <int Dim, class T, class Tag>
 static inline
 void simpleAssign(const Array<Dim, T, Remote<Tag> >& lhs,
 		  const Array<Dim, T, Remote<Tag> >& rhs,
-		  const Interval<Dim>& domain)
+		  const Interval<Dim>& domain,
+		  const Interval<Dim>& domain2,
+		  bool optimize)
 {
-  if (lhs.engine().owningContext() == rhs.engine().owningContext())
-    lhs(domain) = rhs(domain);
-  else {
-    typedef typename NewEngine<Engine<Dim, T, Tag>, Interval<Dim> >::Type_t ViewEngine_t;
-    if (lhs.engine().engineIsLocal())
-      Receive<ViewEngine_t>::receive(ViewEngine_t(lhs.engine().localEngine(), domain),
-				     rhs.engine().owningContext());
-    else if (rhs.engine().engineIsLocal())
-      SendReceive::send(ViewEngine_t(rhs.engine().localEngine(), domain),
+  if (lhs.engine().owningContext() == rhs.engine().owningContext()) {
+    PAssert(lhs.engine().engineIsLocal() && rhs.engine().engineIsLocal());
+    Array<Dim, T, Tag> llhs, lrhs;
+    llhs.engine() = lhs.engine().localEngine();
+    lrhs.engine() = rhs.engine().localEngine();
+    llhs(domain2) = lrhs(domain2);
+  } else {
+    if (!optimize) {
+      typedef typename NewEngine<Engine<Dim, T, Tag>, Interval<Dim> >::Type_t ViewEngine_t;
+      if (lhs.engine().engineIsLocal())
+	Receive<ViewEngine_t>::receive(ViewEngine_t(lhs.engine().localEngine(), domain),
+				       rhs.engine().owningContext());
+      else if (rhs.engine().engineIsLocal())
+	SendReceive::send(ViewEngine_t(rhs.engine().localEngine(), domain),
+			  lhs.engine().owningContext());
+    } else {
+      if (lhs.engine().engineIsLocal())
+	GuardReceive::receive(lhs.engine().localEngine(), domain,
+			      rhs.engine().owningContext());
+      else if (rhs.engine().engineIsLocal())
+	GuardSend::send(rhs.engine().localEngine(), domain,
 			lhs.engine().owningContext());
+    }
   }
 }
 
+#endif
+
 template <int Dim, class T, class LayoutTag, class PatchTag>
 void Engine<Dim, T, MultiPatch<LayoutTag,PatchTag> >::
 fillGuardsHandler(const GuardLayers<Dim>& g, const WrappedInt<true> &) const
 {
-  if (!isDirty()) return;
+  if (!isDirty())
+    return;
 
-  int updated = 0;
   typename Layout_t::FillIterator_t p = layout_m.beginFillList();
+  GuardLayers<Dim> clean = pDirty_m->clean_m;
 
   while (p != layout_m.endFillList())
     {
-      int src  = p->ownedID_m;
-      int dest = p->guardID_m;
-      
-      // Skip face, if not dirty.
+      // Check how much of the guards (needed) we need where (dim, upper)
 
-      if (isDirty(p->face_m)) {
+      int dim = p->face_m/2;
+      bool upper = p->face_m&1;
+      int needed = upper ? g.upper(dim) : g.lower(dim);
 
-        // Check, if the p->domain_m is a guard which matches the
-        // needed guard g.
+      // Check against up-to-date status
 
-	int d = p->face_m/2;
-	int guardSizeNeeded = p->face_m & 1 ? g.upper(d) : g.lower(d);
-        if (!(p->face_m != -1
-	      && guardSizeNeeded == 0)) {
+      if (pDirty_m->upToDate(p->face_m) < needed) {
 
-          // Create patch arrays that see the entire patch:
+	// Create patch arrays that see the entire patch:
                   
-          Array<Dim, T, PatchTag> lhs(data()[dest]), rhs(data()[src]);
-      
-          // Now do assignment from the subdomains.
+	int src  = p->ownedID_m;
+	int dest = p->guardID_m;
+	Array<Dim, T, PatchTag> lhs(data()[dest]), rhs(data()[src]);
+
+	// Compute subdomain we need to assign by
+	// - shrinking the domain according to the needed guards
+	// - shrinking the domain according to the already up-to-date guards
+
+	// needed
+	Interval<Dim> domain = p->domain_m;
+	if (upper)
+	  domain[dim] = shrinkRight(p->domain_m[dim], p->domain_m[dim].size() - g.upper(dim));
+	else
+	  domain[dim] = shrinkLeft(p->domain_m[dim], p->domain_m[dim].size() - g.lower(dim));
+
+	// needed minus already up-to-date
+	Interval<Dim> domain2 = domain;
+	if (upper)
+	  domain2[dim] = shrinkLeft(domain[dim], pDirty_m->upToDate(p->face_m));
+	else
+	  domain2[dim] = shrinkRight(domain[dim], pDirty_m->upToDate(p->face_m));
+
+	// Now do the assignment
 #if POOMA_MPI
-          simpleAssign(lhs, rhs, p->domain_m);
+	simpleAssign(lhs, rhs, domain, domain2, true);
 #else
-          lhs(p->domain_m) = rhs(p->domain_m);
+	lhs(domain2) = rhs(domain2);
 #endif
 
-	  // Mark up-to-date.
-	  updated |= 1<<p->face_m;
-
-	}
+	// Record guard up-to-date-ness change, but defer real update
+	// because it would confuse further processing of the fill list.
+	if (upper)
+		clean.upper(dim) = needed;
+	else
+		clean.lower(dim) = needed;
 
       }
 
       ++p;
     }
 
-  *pDirty_m &= ~updated;
+  // Do the deferred update of the clean status.
+  pDirty_m->clean_m = clean;
+
+  // Check if all internal guards are clean and update the dirty flag accordingly.
+  if (pDirty_m->clean_m >= layout().internalGuards())
+    pDirty_m->dirty_m = false;
 }
 
 
@@ -377,7 +423,7 @@
       ++p;
     }
 
-  setDirty();
+  clearDirty();
 }
 
 
===== src/Layout/GuardLayers.h 1.5 vs edited =====
--- 1.5/r2/src/Layout/GuardLayers.h	2003-12-03 12:30:43 +01:00
+++ edited/src/Layout/GuardLayers.h	2004-07-22 17:27:26 +02:00
@@ -57,6 +57,7 @@
 class GuardLayers
 {
 public:
+  typedef int Element_t;
 
   //============================================================
   // Constructors
@@ -136,12 +137,12 @@
   // Mutators
   //============================================================
   
-  int &lower(int i) 
+  Element_t &lower(int i) 
   {    
     PInsist(i<Dim&&i>=0," GuardLayers index out of range ");
     return lower_m[i]; 
   }
-  int &upper(int i) 
+  Element_t &upper(int i) 
   {    
     PInsist(i<Dim&&i>=0," GuardLayers index out of range ");
     return upper_m[i]; 
@@ -162,6 +163,17 @@
     return result;
   }
   
+  bool operator>=(const GuardLayers<Dim> &gcs) const
+  {
+    for (int d = 0; d < Dim; ++d)
+      {
+	if (!(lower_m[d] >= gcs.lower_m[d]
+	      && upper_m[d] >= gcs.upper_m[d]))
+	  return false;
+      }
+    return true;
+  }
+  
   bool operator==(int gcw) const
   {
     bool result = true;
@@ -250,8 +262,8 @@
   
 private:
 
-  int lower_m[Dim];
-  int upper_m[Dim];
+  Element_t lower_m[Dim];
+  Element_t upper_m[Dim];
 };
 
 template<int Dim>
===== src/Threads/IterateSchedulers/SerialAsync.h 1.13 vs edited =====
--- 1.13/r2/src/Threads/IterateSchedulers/SerialAsync.h	2004-07-15 16:55:47 +02:00
+++ edited/src/Threads/IterateSchedulers/SerialAsync.h	2004-07-21 15:09:35 +02:00
@@ -263,7 +263,7 @@
       res = MPI_Testsome(last_used_request+1, requests_m,
 			 &nr_finished, finished, statuses);
     PAssert(res == MPI_SUCCESS || res == MPI_ERR_IN_STATUS);
-    if (nr_finished == MPI_UNDEFINED)
+    if (nr_finished == MPI_UNDEFINED || nr_finished == 0)
       return false;
 
     // release finised requests
@@ -311,9 +311,13 @@
   static bool runSomething(bool mayBlock = true)
   {
     // do work in this order to minimize communication latency:
+    // - process finished messages
     // - issue all messages
     // - do some regular work
     // - wait for messages to complete
+
+    if (waitForSomeRequests(false))
+      return true;
 
     RunnablePtr_t p = NULL;
     if (!workQueueMessages_m.empty()) {
===== src/Tulip/Messaging.cmpl.cpp 1.5 vs edited =====
--- 1.5/r2/src/Tulip/Messaging.cmpl.cpp	2004-01-07 12:18:08 +01:00
+++ edited/src/Tulip/Messaging.cmpl.cpp	2004-07-21 10:05:49 +02:00
@@ -48,6 +48,12 @@
 int  RemoteProxyBase::tag_m = 0;
 #endif
 
+#if POOMA_MPI
+typedef std::pair<int, Pooma::DataObject_t*> GIMKey_t;
+typedef std::map<GIMKey_t, Pooma::Iterate_t*> GIMap_t;
+GIMap_t guardSendIterateMap_g;
+GIMap_t guardReceiveIterateMap_g;
+#endif
 
 //-----------------------------------------------------------------------------
 // Tag generator creates a set of tags for global use in r2.  There is a
===== src/Tulip/SendReceive.h 1.12 vs edited =====
--- 1.12/r2/src/Tulip/SendReceive.h	2004-01-07 12:18:11 +01:00
+++ edited/src/Tulip/SendReceive.h	2004-07-23 13:50:24 +02:00
@@ -491,6 +491,262 @@
 };
 
 
+/// A map of <dest/source context, DataObject*> -> Iterate* relations.
+typedef std::pair<int, Pooma::DataObject_t*> GIMKey_t;
+typedef std::map<GIMKey_t, Pooma::Iterate_t*> GIMap_t;
+extern GIMap_t guardSendIterateMap_g;
+extern GIMap_t guardReceiveIterateMap_g;
+
+/** 
+ * A GuardSendIterate requests a read lock on a piece of data.  When that read
+ * lock is granted, we serialize the requested subdomain and send it via MPI to
+ * the appropriate context.  We construct the GuardSendIterate with a tag that
+ * is used to match the appropriate GuardReceiveIterate on the remote context.
+ */
+
+template<class View>
+class GuardSendIterate
+  : public Pooma::Iterate_t
+{
+public:
+  GuardSendIterate(const View &view, const Interval<View::dimensions> &domain,
+		   int toContext, int tag)
+    : Pooma::Iterate_t(Pooma::scheduler()),
+      toContext_m(toContext),
+      tag_m(tag),
+      view_m(view),
+      domain_m(domain)
+  {
+    hintAffinity(engineFunctor(view_m,
+			       DataObjectRequest<BlockAffinity>()));
+
+#if POOMA_REORDER_ITERATES
+    // Priority interface was added to r2 version of serial async so that
+    // message send iterates would run before any other iterates.
+    priority(-1);
+#endif
+
+    DataObjectRequest<WriteRequest> writeReq(*this);
+    DataObjectRequest<ReadRequest> readReq(writeReq);
+    engineFunctor(view_m, readReq);
+  }
+
+  virtual void run()
+  {
+    // at this point we no longer can accept merging with other
+    // GuardSendIterates.
+    guardSendIterateMap_g.erase(GIMKey_t(toContext_m, view_m.dataObject()));
+
+    typedef typename NewEngine<View, Interval<View::dimensions> >::Type_t View_t;
+    typedef Cheetah::Serialize<Cheetah::CHEETAH, View_t> Serialize_t;
+
+    // take the view
+    View_t view(view_m, domain_m);
+
+    // serialize and send buffer
+    int length = Serialize_t::size(view);
+    buffer_m = new char[length];
+    Serialize_t::pack(view, buffer_m);
+    MPI_Request *request = Smarts::SystemContext::getMPIRequest(this);
+    int res = MPI_Isend(buffer_m, length, MPI_CHAR, toContext_m, tag_m,
+			MPI_COMM_WORLD, request);
+    PAssert(res == MPI_SUCCESS);
+
+    // release locks
+    DataObjectRequest<WriteRelease> writeReq;
+    DataObjectRequest<ReadRelease> readReq(writeReq);
+    engineFunctor(view_m, readReq);
+  }
+
+  virtual ~GuardSendIterate()
+  {
+    // cleanup temporary objects.
+    delete[] buffer_m;
+  }
+
+  void addDomain(const Interval<View::dimensions>& domain)
+  {
+    if (contains(domain, domain_m))
+      domain_m = domain;
+    else 
+      PAssert(contains(domain_m, domain));
+  }
+
+private:
+
+  // Context we're sending the data to.
+
+  int toContext_m;
+
+  // A tag used to match the sent data with the right receive.
+
+  int tag_m;
+
+  // Communication buffer.
+
+  char *buffer_m;
+
+  // The data we're sending, a local patch with the domain to send
+
+  View view_m;
+  Interval<View::dimensions> domain_m;
+};
+
+
+/**
+ * A GuardReceiveIterate requests a write lock on a piece of data.  When that
+ * lock is granted, we post a non-blocking MPI receive for the requested
+ * subdomain.  The data is unpacked into the target view and the write lock
+ * is released when the iterate is destroyed after the message has arrived.
+ */
+
+template<class View>
+class GuardReceiveIterate
+  : public Pooma::Iterate_t
+{
+public:
+
+  typedef GuardReceiveIterate<View> This_t;
+
+  GuardReceiveIterate(const View &view, const Interval<View::dimensions> &domain,
+		      int fromContext, int tag)
+    : Pooma::Iterate_t(Pooma::scheduler()),
+      fromContext_m(fromContext),
+      tag_m(tag), buffer_m(NULL),
+      view_m(view),
+      domain_m(domain)
+  {
+    hintAffinity(engineFunctor(view,
+			       DataObjectRequest<BlockAffinity>()));
+
+#if POOMA_REORDER_ITERATES
+    // Priority interface was added to r2 version of serial async so that
+    // message receive iterates would run after any other iterates.
+    priority(-1);
+#endif
+
+    DataObjectRequest<WriteRequest> writeReq(*this);
+    engineFunctor(view, writeReq);
+
+    Pooma::addIncomingMessage();
+  }
+
+  virtual void run()
+  {
+    // at this point we can no longer accept merges with other
+    // GuardReceiveIterates.
+    guardReceiveIterateMap_g.erase(GIMKey_t(fromContext_m, view_m.dataObject()));
+
+    typedef typename NewEngine<View, Interval<View::dimensions> >::Type_t View_t;
+
+    // take the view - maybe we can optimize this, because we need it
+    // only for the size calculation
+    View_t view(view_m, domain_m);
+
+    int length = Cheetah::Serialize<Cheetah::CHEETAH, View_t>::size(view);
+    buffer_m = new char[length];
+    MPI_Request *request = Smarts::SystemContext::getMPIRequest(this);
+    int res = MPI_Irecv(buffer_m, length, MPI_CHAR, fromContext_m, tag_m,
+			MPI_COMM_WORLD, request);
+    PAssert(res == MPI_SUCCESS);
+  }
+
+  virtual ~GuardReceiveIterate()
+  {
+    typedef typename NewEngine<View, Interval<View::dimensions> >::Type_t View_t;
+    typedef Cheetah::Serialize<Cheetah::CHEETAH, View_t> Serialize_t;
+
+    // take the view
+    View_t view(view_m, domain_m);
+
+    // de-serialize into target view directly
+    Serialize_t::unpack(view, buffer_m);
+
+    // cleanup temporary objects
+    delete[] buffer_m;
+
+    // release locks
+    DataObjectRequest<WriteRelease> writeReq;
+    engineFunctor(view_m, writeReq);
+
+    Pooma::gotIncomingMessage();
+  }
+
+  void addDomain(const Interval<View::dimensions>& domain)
+  {
+    if (contains(domain, domain_m))
+      domain_m = domain;
+    else
+      PAssert(contains(domain_m, domain));
+  }
+
+private:
+
+  // Context we're receiving the data from.
+
+  int fromContext_m;
+
+  // A tag used to match the received data with the right send.
+
+  int tag_m;
+
+  // Communication buffer.
+
+  char *buffer_m;
+
+  // The place to put the data we're receiving, a local patch with
+  // the domain to receive to.
+
+  View view_m;
+  Interval<View::dimensions> domain_m;
+};
+
+/**
+ * GuardSend and GuardReceive provide the static functions
+ * send(view, domain, context) and receive(view, domain, context).  They
+ * encapsulate generating matching tags, launching the communication
+ * iterates, and merging requests into an already pending iterate.
+ */
+
+struct GuardSend
+{
+  template<class View>
+  static
+  void send(const View &view, const Interval<View::dimensions> &domain,
+	    int toContext)
+  {
+    PAssert(toContext >= 0 && toContext < Pooma::contexts());
+    Pooma::Iterate_t* &it = guardSendIterateMap_g[GIMKey_t(toContext, view.dataObject())];
+    if (!it) {
+      int tag = Pooma::sendTag(toContext);
+      it = new GuardSendIterate<View>(view, domain, toContext, tag);
+      Pooma::scheduler().handOff(it);
+    } else {
+      static_cast<GuardSendIterate<View>*>(it)->addDomain(domain);
+    }
+  }
+};
+
+struct GuardReceive
+{
+  template<class View>
+  static
+  void receive(const View &view, const Interval<View::dimensions> &domain,
+	       int fromContext)
+  {
+    PAssert(fromContext >= 0 && fromContext < Pooma::contexts());
+    Pooma::Iterate_t* &it = guardReceiveIterateMap_g[GIMKey_t(fromContext, view.dataObject())];
+    if (!it) {
+      int tag = Pooma::receiveTag(fromContext);
+      it = new GuardReceiveIterate<View>(view, domain, fromContext, tag);
+      Pooma::scheduler().handOff(it);
+    } else {
+      static_cast<GuardReceiveIterate<View>*>(it)->addDomain(domain);
+    }
+  }
+};
+
+
 #else // not POOMA_MESSAGING
 
 
--- /dev/null	Tue May 18 17:20:27 2004
+++ src/Array/tests/array_test30.cpp	Fri Jul 23 13:36:41 2004
@@ -0,0 +1,127 @@
+// -*- C++ -*-
+// ACL:license
+// ----------------------------------------------------------------------
+// This software and ancillary information (herein called "SOFTWARE")
+// called POOMA (Parallel Object-Oriented Methods and Applications) is
+// made available under the terms described here.  The SOFTWARE has been
+// approved for release with associated LA-CC Number LA-CC-98-65.
+// 
+// Unless otherwise indicated, this SOFTWARE has been authored by an
+// employee or employees of the University of California, operator of the
+// Los Alamos National Laboratory under Contract No. W-7405-ENG-36 with
+// the U.S. Department of Energy.  The U.S. Government has rights to use,
+// reproduce, and distribute this SOFTWARE. The public may copy, distribute,
+// prepare derivative works and publicly display this SOFTWARE without 
+// charge, provided that this Notice and any statement of authorship are 
+// reproduced on all copies.  Neither the Government nor the University 
+// makes any warranty, express or implied, or assumes any liability or 
+// responsibility for the use of this SOFTWARE.
+// 
+// If SOFTWARE is modified to produce derivative works, such modified
+// SOFTWARE should be clearly marked, so as not to confuse it with the
+// version available from LANL.
+// 
+// For more information about POOMA, send e-mail to pooma@xxxxxxxxxxxx,
+// or visit the POOMA web page at http://www.acl.lanl.gov/pooma/.
+// ----------------------------------------------------------------------
+// ACL:license
+
+//-----------------------------------------------------------------------------
+// array_test30: verify correctness of igc updates
+//-----------------------------------------------------------------------------
+
+// Include files
+
+#include "Pooma/Arrays.h"
+#include "Utilities/Tester.h"
+#include <iostream>
+
+
+template <class A1, class A2>
+bool test(Pooma::Tester& tester,
+	  const A1& a_mp, const A1& b_mp,
+	  const A2& a_sp, const A2& b_sp,
+	  const Loc<2>& delta1, const Loc<2>& delta2)
+{
+  static int sequence = 0;
+  Interval<2> I;
+
+  // initialize rhs arrays, ensure wrong igc values
+  // via sequence number.
+  I = b_sp.totalDomain();
+  b_sp(I) = sequence + iota(I).comp(0) + I[0].size()*iota(I).comp(1);
+  b_mp.engine().setGuards(0);
+  b_mp(I) = b_sp(I);
+
+  // do calculation both sp and mp
+  I = a_sp.physicalDomain();
+  a_sp(I) = b_sp(I+delta1) - b_sp(I+delta2);
+  a_mp(I) = b_mp(I+delta1) - b_mp(I+delta2);
+
+  // check the results are the same everywhere
+  bool res = all(a_sp(I) == a_mp(I));
+  tester.out() << "For deltas " << delta1
+	       << " and " << delta2 << " ";
+  tester.check("result is", res);
+  if (!res) {
+    int n = b_mp.layout().sizeGlobal();
+    for (int i=0; i<n; ++i) {
+      Array<2, int, Remote<Brick> > b(b_mp.engine().globalPatch(i));
+      tester.out() << "Brick " << i << " " << intersect(b.domain(), b_mp.physicalDomain())
+		   << " is\n" << b(intersect(b.totalDomain(), b_mp.physicalDomain()))
+		   << std::endl;
+    }
+    tester.out() << "Aborting." << std::endl;
+    return false;
+  }
+
+  sequence++;
+
+  return true;
+}
+
+
+int main(int argc, char *argv[])
+{
+  // Initialize POOMA and output stream, using Tester class
+  Pooma::initialize(argc, argv);
+  Pooma::Tester tester(argc, argv);
+
+  Interval<2> domain(12, 12);
+  UniformGridLayout<2> layout_mp(domain, Loc<2>(3, 3),
+				 GuardLayers<2>(2), DistributedTag());
+  DomainLayout<2> layout_sp(domain, GuardLayers<2>(2));
+
+  Array<2, int, MultiPatch<UniformTag, Remote<Brick> > >
+    a_mp(layout_mp), b_mp(layout_mp);
+  Array<2, int, Brick>
+    a_sp(layout_sp), b_sp(layout_sp);
+
+  // all 5^4 == 625 cases
+  for (int d1i = -2; d1i <= 2; ++d1i)
+    for (int d1j = -2; d1j <= 2; ++d1j)
+      for (int d2i = -2; d2i <= 2; ++d2i)
+	for (int d2j = -2; d2j <= 2; ++d2j)
+	  if (!test(tester, a_mp, b_mp, a_sp, b_sp,
+		    Loc<2>(d1i, d1j), Loc<2>(d2i, d2j)))
+	    goto out;
+ out:
+
+  // Expected results are
+  //   passes with 1, 2, 9
+  //   fails with 3, 4, 5, 6, 7, 8
+  // which hints at problems with mixed local->local / local->remote igc updates
+  // for diagonal igc cells.
+  tester.out() << "Best testing is done with all 1 to 9 processes" << std::endl;
+
+  int retval = tester.results("array_test30");
+  Pooma::finalize();
+  return retval;
+}
+
+// ACL:rcsinfo
+// ----------------------------------------------------------------------
+// $RCSfile: array_test29.cpp,v $   $Author: pooma $
+// $Revision: 1.1 $   $Date: 2004/07/20 18:41:00 $
+// ----------------------------------------------------------------------
+// ACL:rcsinfo