Improve rsMatrix* documentation, fix bugs

Improves the user-facing documentation.  Fix the incorrect row & column
naming on the Get/Set API.  Fix a bug where rsMatrixLoadMultiply could
not have the destination be one of the sources,
e.g. rsMatrixLoadMultiply(&l, &l, &r)

Change-Id: I42207aacf4ebe815d4a79db2aaa9c44f85864696
diff --git a/driver/runtime/rs_matrix.c b/driver/runtime/rs_matrix.c
index 052e829..064f233 100644
--- a/driver/runtime/rs_matrix.c
+++ b/driver/runtime/rs_matrix.c
@@ -170,33 +170,33 @@
 
 
 extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 4 + col] = v;
+rsMatrixSet(rs_matrix4x4 *m, uint32_t col, uint32_t row, float v) {
+    m->m[col * 4 + row] = v;
 }
 
 extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 4 + col];
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t col, uint32_t row) {
+    return m->m[col * 4 + row];
 }
 
 extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 3 + col] = v;
+rsMatrixSet(rs_matrix3x3 *m, uint32_t col, uint32_t row, float v) {
+    m->m[col * 3 + row] = v;
 }
 
 extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 3 + col];
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t col, uint32_t row) {
+    return m->m[col * 3 + row];
 }
 
 extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 2 + col] = v;
+rsMatrixSet(rs_matrix2x2 *m, uint32_t col, uint32_t row, float v) {
+    m->m[col * 2 + row] = v;
 }
 
 extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 2 + col];
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t col, uint32_t row) {
+    return m->m[col * 2 + row];
 }
 
 extern float2 __attribute__((overloadable))
@@ -238,6 +238,9 @@
 
 extern void __attribute__((overloadable))
 rsMatrixLoadMultiply(rs_matrix4x4 *ret, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. rsMatrixLoadMultiply(&left, &left, &right);
+    rs_matrix4x4 result;
     for (int i=0 ; i<4 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -250,22 +253,24 @@
             ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
             ri3 += rsMatrixGet(lhs, j, 3) * rhs_ij;
         }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
-        rsMatrixSet(ret, i, 2, ri2);
-        rsMatrixSet(ret, i, 3, ri3);
+        rsMatrixSet(&result, i, 0, ri0);
+        rsMatrixSet(&result, i, 1, ri1);
+        rsMatrixSet(&result, i, 2, ri2);
+        rsMatrixSet(&result, i, 3, ri3);
     }
+    rsMatrixLoad(ret, &result);
 }
 
 extern void __attribute__((overloadable))
 rsMatrixMultiply(rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
-    rs_matrix4x4 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
+    rsMatrixLoadMultiply(lhs, lhs, rhs);
 }
 
 extern void __attribute__((overloadable))
 rsMatrixLoadMultiply(rs_matrix3x3 *ret, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. rsMatrixLoadMultiply(&left, &left, &right);
+    rs_matrix3x3 result;
     for (int i=0 ; i<3 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -276,21 +281,23 @@
             ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
             ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
         }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
-        rsMatrixSet(ret, i, 2, ri2);
+        rsMatrixSet(&result, i, 0, ri0);
+        rsMatrixSet(&result, i, 1, ri1);
+        rsMatrixSet(&result, i, 2, ri2);
     }
+    rsMatrixLoad(ret, &result);
 }
 
 extern void __attribute__((overloadable))
 rsMatrixMultiply(rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
-    rs_matrix3x3 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
+    rsMatrixLoadMultiply(lhs, lhs, rhs);
 }
 
 extern void __attribute__((overloadable))
 rsMatrixLoadMultiply(rs_matrix2x2 *ret, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. rsMatrixLoadMultiply(&left, &left, &right);
+    rs_matrix2x2 result;
     for (int i=0 ; i<2 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -299,15 +306,14 @@
             ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
             ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
         }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
+        rsMatrixSet(&result, i, 0, ri0);
+        rsMatrixSet(&result, i, 1, ri1);
     }
+    rsMatrixLoad(ret, &result);
 }
 
 extern void __attribute__((overloadable))
 rsMatrixMultiply(rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
-    rs_matrix2x2 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
+    rsMatrixLoadMultiply(lhs, lhs, rhs);
 }
 
diff --git a/rsMatrix2x2.cpp b/rsMatrix2x2.cpp
index 622113c..91accbe 100644
--- a/rsMatrix2x2.cpp
+++ b/rsMatrix2x2.cpp
@@ -42,6 +42,9 @@
 }
 
 void Matrix2x2::loadMultiply(const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. left.loadMultiply(left, right);
+    Matrix2x2 temp;
     for (int i=0 ; i<2 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -50,9 +53,10 @@
             ri0 += ((const Matrix2x2 *)lhs)->get(j, 0) * rhs_ij;
             ri1 += ((const Matrix2x2 *)lhs)->get(j, 1) * rhs_ij;
         }
-        set(i, 0, ri0);
-        set(i, 1, ri1);
+        temp.set(i, 0, ri0);
+        temp.set(i, 1, ri1);
     }
+    load(&temp);
 }
 
 void Matrix2x2::transpose() {
diff --git a/rsMatrix2x2.h b/rsMatrix2x2.h
index 4fbd1c2..f908c0c 100644
--- a/rsMatrix2x2.h
+++ b/rsMatrix2x2.h
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix2x2 : public rs_matrix2x2 {
-    inline float get(uint32_t x, uint32_t y) const {
-        return m[x*2 + y];
+    inline float get(uint32_t col, uint32_t row) const {
+        return m[col*2 + row];
     }
 
-    inline void set(uint32_t x, uint32_t y, float v) {
-        m[x*2 + y] = v;
+    inline void set(uint32_t col, uint32_t row, float v) {
+        m[col*2 + row] = v;
     }
 
     void loadIdentity();
@@ -42,9 +42,7 @@
     void transpose();
 
     void multiply(const rs_matrix2x2 *rhs) {
-        Matrix2x2 tmp;
-        tmp.loadMultiply(this, rhs);
-        load(&tmp);
+        loadMultiply(this, rhs);
     }
 };
 
diff --git a/rsMatrix3x3.cpp b/rsMatrix3x3.cpp
index 3f9a2d1..4f27fcc 100644
--- a/rsMatrix3x3.cpp
+++ b/rsMatrix3x3.cpp
@@ -46,6 +46,9 @@
 }
 
 void Matrix3x3::loadMultiply(const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. left.loadMultiply(left, right);
+    Matrix3x3 temp;
     for (int i=0 ; i<3 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -56,10 +59,11 @@
             ri1 += ((const Matrix3x3 *)lhs)->get(j, 1) * rhs_ij;
             ri2 += ((const Matrix3x3 *)lhs)->get(j, 2) * rhs_ij;
         }
-        set(i, 0, ri0);
-        set(i, 1, ri1);
-        set(i, 2, ri2);
+        temp.set(i, 0, ri0);
+        temp.set(i, 1, ri1);
+        temp.set(i, 2, ri2);
     }
+    load(&temp);
 }
 
 void Matrix3x3::transpose() {
diff --git a/rsMatrix3x3.h b/rsMatrix3x3.h
index 05249b1..5c10846 100644
--- a/rsMatrix3x3.h
+++ b/rsMatrix3x3.h
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix3x3 : public rs_matrix3x3 {
-    inline float get(uint32_t x, uint32_t y) const {
-        return m[x*3 + y];
+    inline float get(uint32_t col, uint32_t row) const {
+        return m[col*3 + row];
     }
 
-    inline void set(uint32_t x, uint32_t y, float v) {
-        m[x*3 + y] = v;
+    inline void set(uint32_t col, uint32_t row, float v) {
+        m[col*3 + row] = v;
     }
 
     void loadIdentity();
@@ -42,9 +42,7 @@
     void transpose();
 
     void multiply(const rs_matrix3x3 *rhs) {
-        Matrix3x3 tmp;
-        tmp.loadMultiply(this, rhs);
-        load(&tmp);
+        loadMultiply(this, rhs);
     }
 };
 
diff --git a/rsMatrix4x4.cpp b/rsMatrix4x4.cpp
index c6f96d8..f166c57 100644
--- a/rsMatrix4x4.cpp
+++ b/rsMatrix4x4.cpp
@@ -250,6 +250,9 @@
 }
 
 void Matrix4x4::loadMultiply(const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
+    // Use a temporary variable to support the case where one of the inputs
+    // is also the destination, e.g. left.loadMultiply(left, right);
+    Matrix4x4 temp;
     for (int i=0 ; i<4 ; i++) {
         float ri0 = 0;
         float ri1 = 0;
@@ -262,11 +265,12 @@
             ri2 += ((const Matrix4x4 *)lhs)->get(j,2) * rhs_ij;
             ri3 += ((const Matrix4x4 *)lhs)->get(j,3) * rhs_ij;
         }
-        set(i,0, ri0);
-        set(i,1, ri1);
-        set(i,2, ri2);
-        set(i,3, ri3);
+        temp.set(i,0, ri0);
+        temp.set(i,1, ri1);
+        temp.set(i,2, ri2);
+        temp.set(i,3, ri3);
     }
+    load(&temp);
 }
 
 void Matrix4x4::loadOrtho(float left, float right, float bottom, float top, float near, float far) {
@@ -299,6 +303,7 @@
     loadFrustum(left, right, bottom, top, near, far);
 }
 
+// Note: This assumes that the input vector (in) is of length 3.
 void Matrix4x4::vectorMultiply(float *out, const float *in) const {
     out[0] = (m[0] * in[0]) + (m[4] * in[1]) + (m[8] * in[2]) + m[12];
     out[1] = (m[1] * in[0]) + (m[5] * in[1]) + (m[9] * in[2]) + m[13];
diff --git a/rsMatrix4x4.h b/rsMatrix4x4.h
index 44c33d1..8e4a586 100644
--- a/rsMatrix4x4.h
+++ b/rsMatrix4x4.h
@@ -25,12 +25,12 @@
 namespace renderscript {
 
 struct Matrix4x4 : public rs_matrix4x4 {
-    float get(uint32_t x, uint32_t y) const {
-        return m[x*4 + y];
+    float get(uint32_t col, uint32_t row) const {
+        return m[col*4 + row];
     }
 
-    void set(uint32_t x, uint32_t y, float v) {
-        m[x*4 + y] = v;
+    void set(uint32_t col, uint32_t row, float v) {
+        m[col*4 + row] = v;
     }
 
     void loadIdentity();
@@ -48,6 +48,7 @@
     void loadFrustum(float l, float r, float b, float t, float n, float f);
     void loadPerspective(float fovy, float aspect, float near, float far);
 
+    // Note: This assumes that the input vector (in) is of length 3.
     void vectorMultiply(float *v4out, const float *v3in) const;
 
     bool inverse();
@@ -58,9 +59,7 @@
 
 
     void multiply(const rs_matrix4x4 *rhs) {
-        Matrix4x4 tmp;
-        tmp.loadMultiply(this, rhs);
-        load(&tmp);
+        loadMultiply(this, rhs);
     }
     void rotate(float rot, float x, float y, float z) {
         Matrix4x4 tmp;
diff --git a/scriptc/rs_matrix.rsh b/scriptc/rs_matrix.rsh
index ebff7f4..34b9532 100644
--- a/scriptc/rs_matrix.rsh
+++ b/scriptc/rs_matrix.rsh
@@ -15,8 +15,35 @@
  */
 
 /** @file rs_matrix.rsh
- *  \brief Matrix routines
+ *  \brief Matrix functions.
  *
+ * These functions let you manipulate square matrices of size 2x2, 3x3, and 4x4.
+ * They are particularly useful for graphical transformations and are
+ * compatible with OpenGL.
+ *
+ * A few general notes:
+ *
+ * \li We use a zero-based index for rows and columns.  E.g. the last element of
+ * a \ref rs_matrix4x4 is found at (3, 3).
+ *
+ * \li RenderScript uses column-based vectors.  Transforming a vector is done by
+ * postmultiplying the vector, e.g. <em>(matrix * vector)</em>, as provided by
+ * \ref rsMatrixMultiply.
+ *
+ * \li To create a transformation matrix that performs two transformations at
+ * once, multiply the two source matrices, with the first transformation as the
+ * right argument.  E.g. to create a transformation matrix that applies the
+ * transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ * This derives from <em>s2 * (s1 * v)</em>, which is <em>(s2 * s1) * v</em>.
+ *
+ * \li We have two styles of functions to create transformation matrices:
+ * rsMatrixLoad<em>Transformation</em> and rsMatrix<em>Transformation</em>.  The
+ * former style simply stores the transformation matrix in the first argument.
+ * The latter modifies a pre-existing transformation matrix so that the new
+ * transformation happens first.  E.g. if you call \ref rsMatrixTranslate
+ * on a matrix that already does a scaling, the resulting matrix when applied
+ * to a vector will first do the translation then the scaling.
  *
  */
 
@@ -24,54 +51,60 @@
 #define __RS_MATRIX_RSH__
 
 /**
- * Set one element of a matrix.
+ * Set an element of a matrix.
  *
- * @param m The matrix to be set
- * @param row
- * @param col
- * @param v
+ * @param m The matrix that will be modified.
+ * @param col The zero-based column of the element to be set.
+ * @param row The zero-based row of the element to be set.
+ * @param v The value to set.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
  *
  * @return void
  */
 _RS_RUNTIME void __attribute__((overloadable))
-rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v);
+rsMatrixSet(rs_matrix4x4 *m, uint32_t col, uint32_t row, float v);
 /**
  * \overload
  */
 _RS_RUNTIME void __attribute__((overloadable))
-rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v);
+rsMatrixSet(rs_matrix3x3 *m, uint32_t col, uint32_t row, float v);
 /**
  * \overload
  */
 _RS_RUNTIME void __attribute__((overloadable))
-rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v);
+rsMatrixSet(rs_matrix2x2 *m, uint32_t col, uint32_t row, float v);
 
 /**
- * Get one element of a matrix.
+ * Returns one element of a matrix.
  *
- * @param m The matrix to read from
- * @param row
- * @param col
+ * @param m The matrix to extract the element from.
+ * @param col The zero-based column of the element to be extracted.
+ * @param row The zero-based row of the element to be extracted.
+ *
+ * \warning The order of the column and row parameters may be
+ * unexpected.
  *
  * @return float
  */
 _RS_RUNTIME float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col);
+rsMatrixGet(const rs_matrix4x4 *m, uint32_t col, uint32_t row);
 /**
  * \overload
  */
 _RS_RUNTIME float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col);
+rsMatrixGet(const rs_matrix3x3 *m, uint32_t col, uint32_t row);
 /**
  * \overload
  */
 _RS_RUNTIME float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col);
+rsMatrixGet(const rs_matrix2x2 *m, uint32_t col, uint32_t row);
 
 /**
  * Set the elements of a matrix to the identity matrix.
  *
- * @param m
+ * @param m The matrix to set.
  */
 extern void __attribute__((overloadable)) rsMatrixLoadIdentity(rs_matrix4x4 *m);
 /**
@@ -86,7 +119,13 @@
 /**
  * Set the elements of a matrix from an array of floats.
  *
- * @param m
+ * The array of floats should be in row-major order, i.e. the element at
+ * <em>row 0, column 0</em> should be first, followed by the element at
+ * <em>row 0, column 1</em>, etc.
+ *
+ * @param m The matrix to set.
+ * @param v The array of values to set the matrix to. These arrays should be
+ * 4, 9, or 16 floats long, depending on the matrix size.
  */
 extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const float *v);
 /**
@@ -98,18 +137,29 @@
  */
 extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix2x2 *m, const float *v);
 /**
- * \overload
+ * Set the elements of a matrix from another matrix.
+ *
+ * If the source matrix is smaller than the destination, the rest of the
+ * destination is filled with elements of the identity matrix.  E.g.
+ * loading a rs_matrix2x2 into a rs_matrix4x4 will give:
+ *
+ * \htmlonly<table>
+ * <tr><td>m00</td><td>m01</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>m10</td><td>m11</td><td>0.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>1.0</td><td>0.0</td></tr>
+ * <tr><td>0.0</td><td>0.0</td><td>0.0</td><td>1.0</td></tr>
+ * </table>\endhtmlonly
+ *
+ * @param m The matrix to set.
+ * @param v The source matrix.
  */
 extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *v);
 /**
  * \overload
  */
 extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v);
-
 /**
- * Set the elements of a matrix from another matrix.
- *
- * @param m
+ * \overload
  */
 extern void __attribute__((overloadable)) rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v);
 /**
@@ -124,11 +174,19 @@
 /**
  * Load a rotation matrix.
  *
- * @param m
- * @param rot
- * @param x
- * @param y
- * @param z
+ * This function creates a rotation matrix.  The axis of rotation is the
+ * <em>(x, y, z)</em> vector.
+ *
+ * To rotate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * See http://en.wikipedia.org/wiki/Rotation_matrix .
+ *
+ * @param m The matrix to set.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
  */
 extern void __attribute__((overloadable))
 rsMatrixLoadRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
@@ -136,10 +194,16 @@
 /**
  * Load a scale matrix.
  *
- * @param m
- * @param x
- * @param y
- * @param z
+ * This function creates a scaling matrix, where each component of a
+ * vector is multiplied by a number.  This number can be negative.
+ *
+ * To scale a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
  */
 extern void __attribute__((overloadable))
 rsMatrixLoadScale(rs_matrix4x4 *m, float x, float y, float z);
@@ -147,20 +211,38 @@
 /**
  * Load a translation matrix.
  *
- * @param m
- * @param x
- * @param y
- * @param z
+ * This function creates a translation matrix, where a
+ * number is added to each element of a vector.
+ *
+ * To translate a vector, multiply the vector by the created matrix
+ * using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
  */
 extern void __attribute__((overloadable))
 rsMatrixLoadTranslate(rs_matrix4x4 *m, float x, float y, float z);
 
 /**
- * Multiply two matrix (lhs, rhs) and place the result in m.
+ * Multiply two matrices.
  *
- * @param m
- * @param lhs
- * @param rhs
+ * Sets \e m to the matrix product of <em>lhs * rhs</em>.
+ *
+ * To combine two 4x4 transformation matrices, multiply the second transformation matrix
+ * by the first transformation matrix.  E.g. to create a transformation matrix that applies
+ * the transformation \e s1 followed by \e s2, call
+ * <c>rsMatrixLoadMultiply(&combined, &s2, &s1)</c>.
+ *
+ * \warning Prior to version 21, storing the result back into the right matrix is not supported and
+ * will result in undefined behavior.  Use rsMatrixMultiply instead.  E.g. instead of doing
+ * rsMatrixLoadMultiply(&m2r, &m2r, &m2l), use rsMatrixMultiply(&m2r, &m2l).
+ * rsMatrixLoadMultiply(&m2l, &m2r, &m2l) works as expected.
+ *
+ * @param m The matrix to set.
+ * @param lhs The left matrix of the product.
+ * @param rhs The right matrix of the product.
  */
 extern void __attribute__((overloadable))
 rsMatrixLoadMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs);
@@ -176,10 +258,16 @@
 rsMatrixLoadMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs);
 
 /**
- * Multiply the matrix m by rhs and place the result back into m.
+ * Multiply a matrix into another one.
  *
- * @param m (lhs)
- * @param rhs
+ * Sets \e m to the matrix product <em>m * rhs</em>.
+ *
+ * When combining two 4x4 transformation matrices using this function, the resulting
+ * matrix will correspond to performing the \e rhs transformation first followed by
+ * the original \e m transformation.
+ *
+ * @param m The left matrix of the product and the matrix to be set.
+ * @param rhs The right matrix of the product.
  */
 extern void __attribute__((overloadable))
 rsMatrixMultiply(rs_matrix4x4 *m, const rs_matrix4x4 *rhs);
@@ -195,43 +283,73 @@
 rsMatrixMultiply(rs_matrix2x2 *m, const rs_matrix2x2 *rhs);
 
 /**
- * Multiple matrix m with a rotation matrix
+ * Multiply the matrix \e m with a rotation matrix.
  *
- * @param m
- * @param rot
- * @param x
- * @param y
- * @param z
+ * This function modifies a transformation matrix to first do a rotation.
+ * The axis of rotation is the <em>(x, y, z)</em> vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param rot How much rotation to do, in degrees.
+ * @param x The x component of the vector that is the axis of rotation.
+ * @param y The y component of the vector that is the axis of rotation.
+ * @param z The z component of the vector that is the axis of rotation.
  */
 extern void __attribute__((overloadable))
 rsMatrixRotate(rs_matrix4x4 *m, float rot, float x, float y, float z);
 
 /**
- * Multiple matrix m with a scale matrix
+ * Multiply the matrix \e m with a scaling matrix.
  *
- * @param m
- * @param x
- * @param y
- * @param z
+ * This function modifies a transformation matrix to first do a scaling.
+ * When scaling, each component of a vector is multiplied by a number.
+ * This number can be negative.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The multiple to scale the x components by.
+ * @param y The multiple to scale the y components by.
+ * @param z The multiple to scale the z components by.
  */
 extern void __attribute__((overloadable))
 rsMatrixScale(rs_matrix4x4 *m, float x, float y, float z);
 
 /**
- * Multiple matrix m with a translation matrix
+ * Multiply the matrix \e m with a translation matrix.
  *
- * @param m
- * @param x
- * @param y
- * @param z
+ * This function modifies a transformation matrix to first
+ * do a translation.  When translating, a number is added
+ * to each component of a vector.
+ *
+ * To apply this combined transformation to a vector, multiply
+ * the vector by the created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to modify.
+ * @param x The number to add to each x component.
+ * @param y The number to add to each y component.
+ * @param z The number to add to each z component.
  */
 extern void __attribute__((overloadable))
 rsMatrixTranslate(rs_matrix4x4 *m, float x, float y, float z);
 
 /**
- * Load an Ortho projection matrix constructed from the 6 planes
+ * Load an orthographic projection matrix.
  *
- * @param m
+ * Constructs an orthographic projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em> into a unit cube with a corner at
+ * <em>(-1, -1, -1)</em> and the opposite at <em>(1, 1, 1)</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * See https://en.wikipedia.org/wiki/Orthographic_projection .
+ *
+ * @param m The matrix to set.
  * @param left
  * @param right
  * @param bottom
@@ -243,9 +361,16 @@
 rsMatrixLoadOrtho(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
 
 /**
- * Load an Frustum projection matrix constructed from the 6 planes
+ * Load a frustum projection matrix.
  *
- * @param m
+ * Constructs a frustum projection matrix, transforming the box
+ * identified by the six clipping planes <em>left, right, bottom, top,
+ * near, far</em>.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
  * @param left
  * @param right
  * @param bottom
@@ -257,21 +382,36 @@
 rsMatrixLoadFrustum(rs_matrix4x4 *m, float left, float right, float bottom, float top, float near, float far);
 
 /**
- * Load an perspective projection matrix constructed from the 6 planes
+ * Load a perspective projection matrix.
  *
- * @param m
+ * Constructs a perspective projection matrix, assuming a symmetrical field of view.
+ *
+ * To apply this projection to a vector, multiply the vector by the
+ * created matrix using \ref rsMatrixMultiply.
+ *
+ * @param m The matrix to set.
  * @param fovy Field of view, in degrees along the Y axis.
  * @param aspect Ratio of x / y.
- * @param near
- * @param far
+ * @param near The near clipping plane.
+ * @param far The far clipping plane.
  */
 extern void __attribute__((overloadable))
 rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
 
 #if !defined(RS_VERSION) || (RS_VERSION < 14)
 /**
- * Multiply a vector by a matrix and return the result vector.
- * API version 10-13
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 to a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available in API version 10-13.  Starting with API 14,
+ * the function takes a const matrix as the first argument.
  */
 _RS_RUNTIME float4 __attribute__((overloadable))
 rsMatrixMultiply(rs_matrix4x4 *m, float4 in);
@@ -307,8 +447,17 @@
 rsMatrixMultiply(rs_matrix2x2 *m, float2 in);
 #else
 /**
- * Multiply a vector by a matrix and return the result vector.
- * API version 14+
+ * Multiply a vector by a matrix.
+ *
+ * Returns the post-multiplication of the vector by the matrix, i.e. <em>m * in</em>.
+ *
+ * When multiplying a \e float3 to a \e rs_matrix4x4, the vector is expanded with (1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix4x4, the vector is expanded with (0, 1).
+ *
+ * When multiplying a \e float2 to a \e rs_matrix3x3, the vector is expanded with (0).
+ *
+ * This function is available starting with API version 14.
  */
 _RS_RUNTIME float4 __attribute__((overloadable))
 rsMatrixMultiply(const rs_matrix4x4 *m, float4 in);
@@ -346,23 +495,28 @@
 
 
 /**
- * Returns true if the matrix was successfully inversed
+ * Inverts a matrix in place.
  *
- * @param m
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to invert.
  */
 extern bool __attribute__((overloadable)) rsMatrixInverse(rs_matrix4x4 *m);
 
 /**
- * Returns true if the matrix was successfully inversed and transposed.
+ * Inverts and transposes a matrix in place.
  *
- * @param m
+ * The matrix is first inverted then transposed.
+ * Returns true if the matrix was successfully inverted.
+ *
+ * @param m The matrix to modify.
  */
 extern bool __attribute__((overloadable)) rsMatrixInverseTranspose(rs_matrix4x4 *m);
 
 /**
- * Transpose the matrix m.
+ * Transpose the matrix m in place.
  *
- * @param m
+ * @param m The matrix to transpose.
  */
 extern void __attribute__((overloadable)) rsMatrixTranspose(rs_matrix4x4 *m);
 /**