Improve rsMatrix* documentation, fix bugs

Improve the user-facing documentation.  Fix the incorrect row & column
naming on the Get/Set API.  Fix a bug where rsMatrixLoadMultiply could
not have the destination be one of the sources,
e.g. rsMatrixLoadMultiply(&l, &l, &r).

Change-Id: I42207aacf4ebe815d4a79db2aaa9c44f85864696
diff --git a/rsMatrix4x4.h b/rsMatrix4x4.h
index 44c33d1..8e4a586 100644
--- a/rsMatrix4x4.h
+++ b/rsMatrix4x4.h
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix4x4 : public rs_matrix4x4 {
-    float get(uint32_t x, uint32_t y) const {
-        return m[x*4 + y];
+    float get(uint32_t col, uint32_t row) const {
+        return m[col*4 + row];
     }
 
-    void set(uint32_t x, uint32_t y, float v) {
-        m[x*4 + y] = v;
+    void set(uint32_t col, uint32_t row, float v) {
+        m[col*4 + row] = v;
     }
 
     void loadIdentity();
@@ -48,6 +48,7 @@
     void loadFrustum(float l, float r, float b, float t, float n, float f);
     void loadPerspective(float fovy, float aspect, float near, float far);
 
+    // Note: This assumes that the input vector (v3in) is of length 3.
     void vectorMultiply(float *v4out, const float *v3in) const;
 
     bool inverse();
@@ -58,9 +59,7 @@
 
 
     void multiply(const rs_matrix4x4 *rhs) {
-        Matrix4x4 tmp;
-        tmp.loadMultiply(this, rhs);
-        load(&tmp);
+        loadMultiply(this, rhs);
     }
     void rotate(float rot, float x, float y, float z) {
         Matrix4x4 tmp;