Merge "X86: Add missing functions for libclcore files"
diff --git a/api/GenerateHeaderFiles.cpp b/api/GenerateHeaderFiles.cpp
index aac7ecb..9d5b49a 100644
--- a/api/GenerateHeaderFiles.cpp
+++ b/api/GenerateHeaderFiles.cpp
@@ -42,7 +42,7 @@
  * add a check on a flag that can be set for internal builds.  This enables us to keep supporting
  * old APIs in the runtime code.
  */
-static void writeVersionGuardStart(GeneratedFile* file, VersionInfo info, int finalVersion) {
+static void writeVersionGuardStart(GeneratedFile* file, VersionInfo info, unsigned int finalVersion) {
     if (info.intSize == 32) {
         *file << "#ifndef __LP64__\n";
     } else if (info.intSize == 64) {
@@ -218,7 +218,7 @@
         *file << "void";
     }
 
-    *file << makeAttributeTag(spec.getAttribute(), "overloadable",
+    *file << makeAttributeTag(spec.getAttribute(), spec.isOverloadable() ? "overloadable" : "",
                               function->getDeprecatedApiLevel(), function->getDeprecatedMessage());
     *file << "\n";
 
@@ -364,6 +364,10 @@
 
     set<Function*> documentedFunctions;
     for (auto spec : specFile.getFunctionSpecifications()) {
+        // Do not include internal APIs in the header files.
+        if (spec->isInternal()) {
+            continue;
+        }
         Function* function = spec->getFunction();
         if (documentedFunctions.find(function) == documentedFunctions.end()) {
             documentedFunctions.insert(function);
diff --git a/api/GenerateStubsWhiteList.cpp b/api/GenerateStubsWhiteList.cpp
index 9b4297d..69afdbf 100644
--- a/api/GenerateStubsWhiteList.cpp
+++ b/api/GenerateStubsWhiteList.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <algorithm>
+#include <climits>
 #include <iostream>
 #include <iterator>
 #include <sstream>
@@ -25,8 +26,8 @@
 
 using namespace std;
 
-const int kMinimumApiLevelForTests = 11;
-const int kApiLevelWithFirst64Bit = 21;
+const unsigned int kMinimumApiLevelForTests = 11;
+const unsigned int kApiLevelWithFirst64Bit = 21;
 
 // Used to map the built-in types to their mangled representations
 struct BuiltInMangling {
@@ -58,7 +59,7 @@
  * substitution for the provided type name, as would be done (mostly) by a
  * preprocessor.  Returns empty string if there's no substitution.
  */
-static string findSubstitute(const string& typeName, int apiLevel, int intSize) {
+static string findSubstitute(const string& typeName, unsigned int apiLevel, int intSize) {
     const auto& types = systemSpecification.getTypes();
     const auto type = types.find(typeName);
     if (type != types.end()) {
@@ -92,7 +93,7 @@
  * the resulting list.  'apiLevel' and 'intSize' specifies the API level and bitness
  * we are currently processing.
  */
-list<string> expandTypedefs(const string type, int apiLevel, int intSize) {
+list<string> expandTypedefs(const string type, unsigned int apiLevel, int intSize) {
     // Split the string in tokens.
     istringstream stream(type);
     list<string> tokens{istream_iterator<string>{stream}, istream_iterator<string>{}};
@@ -268,7 +269,7 @@
 
 // Write to the stream the mangled representation of each parameter.
 static bool writeParameters(ostringstream* stream, const std::vector<ParameterDefinition*>& params,
-                            int apiLevel, int intSize) {
+                            unsigned int apiLevel, int intSize) {
     if (params.empty()) {
         *stream << "v";
         return true;
@@ -298,7 +299,7 @@
  */
 static bool addFunctionManglingToSet(const Function& function,
                                      const FunctionPermutation& permutation, bool overloadable,
-                                     int apiLevel, int intSize, set<string>* allManglings) {
+                                     unsigned int apiLevel, int intSize, set<string>* allManglings) {
     const string& functionName = permutation.getName();
     string mangling;
     if (overloadable) {
@@ -322,22 +323,25 @@
  * of API levels covered.
  */
 static bool addManglingsForSpecification(const Function& function,
-                                         const FunctionSpecification& spec, int lastApiLevel,
+                                         const FunctionSpecification& spec, unsigned int lastApiLevel,
                                          set<string>* allManglings) {
     // If the function is inlined, we won't generate an unresolved external for that.
     if (spec.hasInline()) {
         return true;
     }
     const VersionInfo info = spec.getVersionInfo();
-    const int minApiLevel = info.minVersion ? info.minVersion : kMinimumApiLevelForTests;
-    const int maxApiLevel = info.maxVersion ? info.maxVersion : lastApiLevel;
+    unsigned int minApiLevel, maxApiLevel;
+    minApiLevel = info.minVersion ? info.minVersion : kMinimumApiLevelForTests;
+    maxApiLevel = info.maxVersion ? info.maxVersion : lastApiLevel;
     const bool overloadable = spec.isOverloadable();
 
     /* We track success rather than aborting early in case of failure so that we
      * generate all the error messages.
      */
     bool success = true;
-    for (int apiLevel = minApiLevel; apiLevel <= maxApiLevel; ++apiLevel) {
+    // Use 64-bit integer here for the loop count to avoid overflow
+    // (minApiLevel == maxApiLevel == UINT_MAX for unreleased API)
+    for (int64_t apiLevel = minApiLevel; apiLevel <= maxApiLevel; ++apiLevel) {
         for (auto permutation : spec.getPermutations()) {
             if (info.intSize == 0 || info.intSize == 32) {
                 if (!addFunctionManglingToSet(function, *permutation, overloadable, apiLevel, 32,
@@ -360,13 +364,17 @@
  * to validate unresolved external references.  'lastApiLevel' is the largest api level found in
  * all spec files.
  */
-static bool generateWhiteListFile(int lastApiLevel) {
+static bool generateWhiteListFile(unsigned int lastApiLevel) {
     bool success = true;
     // We generate all the manglings in a set to remove duplicates and to order them.
     set<string> allManglings;
     for (auto f : systemSpecification.getFunctions()) {
         const Function* function = f.second;
         for (auto spec : function->getSpecifications()) {
+            // Compiler intrinsics are not runtime APIs. Do not include them in the whitelist.
+            if (spec->isIntrinsic()) {
+                continue;
+            }
             if (!addManglingsForSpecification(*function, *spec, lastApiLevel, &allManglings)) {
                 success = false;  // We continue so we can generate all errors.
             }
@@ -444,7 +452,7 @@
  * This file can be used to verify the white list that's also generated in this file.  To do so,
  * run "llvm-nm -undefined-only -just-symbol-name" on the resulting bit code.
  */
-static bool generateApiTesterFile(const string& slangTestDirectory, int apiLevel) {
+static bool generateApiTesterFile(const string& slangTestDirectory, unsigned int apiLevel) {
     GeneratedFile file;
     if (!file.start(slangTestDirectory, "all" + to_string(apiLevel) + ".rs")) {
         return false;
@@ -470,6 +478,10 @@
     for (auto f : systemSpecification.getFunctions()) {
         const Function* function = f.second;
         for (auto spec : function->getSpecifications()) {
+            // Do not include internal APIs in the API tests.
+            if (spec->isInternal()) {
+                continue;
+            }
             VersionInfo info = spec->getVersionInfo();
             if (!info.includesVersion(apiLevel)) {
                 continue;
@@ -503,13 +515,13 @@
     return true;
 }
 
-bool generateStubsWhiteList(const string& slangTestDirectory, int maxApiLevel) {
-    int lastApiLevel = min(systemSpecification.getMaximumApiLevel(), maxApiLevel);
+bool generateStubsWhiteList(const string& slangTestDirectory, unsigned int maxApiLevel) {
+    unsigned int lastApiLevel = min(systemSpecification.getMaximumApiLevel(), maxApiLevel);
     if (!generateWhiteListFile(lastApiLevel)) {
         return false;
     }
     // Generate a test file for each apiLevel.
-    for (int i = kMinimumApiLevelForTests; i <= lastApiLevel; ++i) {
+    for (unsigned int i = kMinimumApiLevelForTests; i <= lastApiLevel; ++i) {
         if (!generateApiTesterFile(slangTestDirectory, i)) {
             return false;
         }
diff --git a/api/GenerateTestFiles.cpp b/api/GenerateTestFiles.cpp
index ddb7c78..45cf1f3 100644
--- a/api/GenerateTestFiles.cpp
+++ b/api/GenerateTestFiles.cpp
@@ -46,7 +46,7 @@
 }
 
 // Returns true if any permutation of the function have tests to b
-static bool needTestFiles(const Function& function, int versionOfTestFiles) {
+static bool needTestFiles(const Function& function, unsigned int versionOfTestFiles) {
     for (auto spec : function.getSpecifications()) {
         if (spec->hasTests(versionOfTestFiles)) {
             return true;
@@ -974,7 +974,7 @@
  * to test.
  */
 static bool writeTestFilesForFunction(const Function& function, const string& directory,
-                                      int versionOfTestFiles) {
+                                      unsigned int versionOfTestFiles) {
     // Avoid creating empty files if we're not testing this function.
     if (!needTestFiles(function, versionOfTestFiles)) {
         return true;
@@ -1026,7 +1026,7 @@
     return true;
 }
 
-bool generateTestFiles(const string& directory, int versionOfTestFiles) {
+bool generateTestFiles(const string& directory, unsigned int versionOfTestFiles) {
     bool success = true;
     for (auto f : systemSpecification.getFunctions()) {
         if (!writeTestFilesForFunction(*f.second, directory, versionOfTestFiles)) {
diff --git a/api/Generator.cpp b/api/Generator.cpp
index c44c995..456f214 100644
--- a/api/Generator.cpp
+++ b/api/Generator.cpp
@@ -59,7 +59,7 @@
  * Constants are defined as follows:
  *
  * constant:  {The name of the constant.}
- * [version: {Starting API level} [ {Last API level that supports this.}]
+ * [version: ({Starting API level} [ {Last API level that supports this.}] | UNRELEASED)]
  * [size: {32 or 64.  Used if this is available only for 32 or 64 bit code.}]
  * value: {The value of the constant.}
  * [hidden:]   ...If present, don't document the constant.  Omit the following two fields.
@@ -73,7 +73,7 @@
  * Types can either be simple types, structs, or enums.  They have the format:
  *
  * type:  {The typedef name of the type.}
- * [version: {Starting API level} [ {Last API level that supports this.}]
+ * [version: ({Starting API level} [ {Last API level that supports this.}] | UNRELEASED)]
  * [size: {32 or 64.  Used if this is available only for 32 or 64 bit code.}]
  * simple: {The C declaration that this type is the typedef equivalent.}
  * [hidden:]   ...If present, don't document the type.  Omit the following two fields.
@@ -85,7 +85,7 @@
  * end:
  *
  * type:  {The typedef name of the type.}
- * [version: {Starting API level} [ {Last API level that supports this.}]
+ * [version: ({Starting API level} [ {Last API level that supports this.}] | UNRELEASED)]
  * [size: {32 or 64.  Used if this is available only for 32 or 64 bit code.}]
  * struct: [{The name that will appear right after the struct keyword}]
  * field: {Type and name of the field}[, "{One line documentation of the field}"]
@@ -99,7 +99,7 @@
  * end:
  *
  * type:  {The typedef name of the type.}
- * [version: {Starting API level} [ {Last API level that supports this.}]
+ * [version: ({Starting API level} [ {Last API level that supports this.}] | UNRELEASED)]
  * [size: {32 or 64.  Used if this is available only for 32 or 64 bit code.}]
  * enum: [{The name that will appear right after the enum keyword}]
  * value: {Type and name of the field}[, "{One line documentation of the field}"]
@@ -114,7 +114,7 @@
  * Functions have the following format:
  *
  * function:  {The name of the function.}
- * [version: {Starting API level} [ {Last API level that supports this.}]
+ * [version: ({Starting API level} [ {Last API level that supports this.}] | UNRELEASED)]
  * [size: {32 or 64.  Used if this is available only for 32 or 64 bit code.}]
  * [attrib: {Attributes of the function.}]
  * [w: {A comma separated list of width supported.  Only 1, 2, 3, 4 are supported.
@@ -122,7 +122,7 @@
  * ... Up to four w: or t: can be defined.  The order matter.  These will be replace
  * ... the #1, #2, #3, #4 that can be found in the rest of the specification.
  * ret: [{The return type} [, "{One line documentation of the return}"]]
- * [arg: {Type}[, {Name}][, {ParameterEntry.testOption}][, "{One line documentation of the field}"]]
+ * [arg:(({Type}[ {Name}])|{Ellipsis})[, {ParameterEntry.testOption}][, "{One line documentation of the field}"]]
  * [arg:   ... Same for all the other arguments of the function.]
  * [hidden:]   ... If present, don't include in the HTML documentation.
  * [deprecated: [{Deprecation message.}]   ... This is deprecated.  Compiler will issue a wrning.
@@ -153,7 +153,7 @@
 
 using namespace std;
 
-static bool parseCommandLine(int argc, char* argv[], int* maxApiLevel, bool* forVerification,
+static bool parseCommandLine(int argc, char* argv[], unsigned int* maxApiLevel, bool* forVerification,
                              vector<string>* specFileNames) {
     for (int i = 1; i < argc; i++) {
         if (argv[i][0] == '-') {
@@ -189,7 +189,7 @@
 
 int main(int argc, char* argv[]) {
     // If there's no restriction, generated test files for the very highest version.
-    int maxApiLevel = 999999;
+    unsigned int maxApiLevel = VersionInfo::kUnreleasedVersion;
     vector<string> specFileNames;
     bool forVerification = false;
     if (!parseCommandLine(argc, argv, &maxApiLevel, &forVerification, &specFileNames)) {
diff --git a/api/Generator.h b/api/Generator.h
index 7a9dd28..5d72101 100644
--- a/api/Generator.h
+++ b/api/Generator.h
@@ -21,7 +21,7 @@
 bool generateHeaderFiles(const std::string& directory);
 
 // Generates the Java and RenderScript test files.  The implementation is in GenerateTestFiles.cpp.
-bool generateTestFiles(const std::string& directory, int versionOfTestFiles);
+bool generateTestFiles(const std::string& directory, unsigned int versionOfTestFiles);
 
 /* Generates the documentation files.  The implementation is in GenerateDocumentation.cpp.
  * If forVerification is false (the default), we generate the .jd files needed by the
@@ -33,6 +33,6 @@
  * when testing slang and that can be used to manually verify the white list.
  * The implementation is in GenerateStubsWhiteList.cpp.
  */
-bool generateStubsWhiteList(const std::string& slangTestDirectory, int maxApiLevel);
+bool generateStubsWhiteList(const std::string& slangTestDirectory, unsigned int maxApiLevel);
 
 #endif  // ANDROID_RS_API_GENERATOR_GENERATOR_H
diff --git a/api/Scanner.cpp b/api/Scanner.cpp
index 84af581..1bd4973 100644
--- a/api/Scanner.cpp
+++ b/api/Scanner.cpp
@@ -175,7 +175,14 @@
     if (!isReturn) {
         size_t nameStart = s.rfind(' ');
         if (nameStart == string::npos) {
-            error() << "Missing variable name\n";
+            if (s == "...") {
+                p->name = s;
+                p->type = "";
+                p->lineNumber = mLineNumber;
+                return p;
+            } else {
+                error() << "Missing variable name\n";
+            }
         } else {
             p->name = s.substr(nameStart + 1);
             s.erase(nameStart);
diff --git a/api/Specification.cpp b/api/Specification.cpp
index f02e429..28e5231 100644
--- a/api/Specification.cpp
+++ b/api/Specification.cpp
@@ -32,7 +32,7 @@
 using namespace std;
 
 // API level when RenderScript was added.
-const int MIN_API_LEVEL = 9;
+const unsigned int MIN_API_LEVEL = 9;
 
 const NumericalType TYPES[] = {
             {"f16", "FLOAT_16", "half", "float", FLOATING_POINT, 11, 5},
@@ -50,6 +50,8 @@
 
 const int NUM_TYPES = sizeof(TYPES) / sizeof(TYPES[0]);
 
+static const char kTagUnreleased[] = "UNRELEASED";
+
 // The singleton of the collected information of all the spec files.
 SystemSpecification systemSpecification;
 
@@ -201,26 +203,34 @@
     }
 }
 
-bool VersionInfo::scan(Scanner* scanner, int maxApiLevel) {
+bool VersionInfo::scan(Scanner* scanner, unsigned int maxApiLevel) {
     if (scanner->findOptionalTag("version:")) {
         const string s = scanner->getValue();
-        sscanf(s.c_str(), "%i %i", &minVersion, &maxVersion);
-        if (minVersion && minVersion < MIN_API_LEVEL) {
-            scanner->error() << "Minimum version must >= 9\n";
-        }
-        if (minVersion == MIN_API_LEVEL) {
-            minVersion = 0;
-        }
-        if (maxVersion && maxVersion < MIN_API_LEVEL) {
-            scanner->error() << "Maximum version must >= 9\n";
+        if (s.compare(0, sizeof(kTagUnreleased), kTagUnreleased) == 0) {
+            // The API is still under development and does not have
+            // an official version number.
+            minVersion = maxVersion = kUnreleasedVersion;
+        } else {
+            sscanf(s.c_str(), "%u %u", &minVersion, &maxVersion);
+            if (minVersion && minVersion < MIN_API_LEVEL) {
+                scanner->error() << "Minimum version must >= 9\n";
+            }
+            if (minVersion == MIN_API_LEVEL) {
+                minVersion = 0;
+            }
+            if (maxVersion && maxVersion < MIN_API_LEVEL) {
+                scanner->error() << "Maximum version must >= 9\n";
+            }
         }
     }
     if (scanner->findOptionalTag("size:")) {
         sscanf(scanner->getValue().c_str(), "%i", &intSize);
     }
+
     if (maxVersion > maxApiLevel) {
         maxVersion = maxApiLevel;
     }
+
     return minVersion == 0 || minVersion <= maxApiLevel;
 }
 
@@ -331,7 +341,7 @@
 }
 
 void ConstantSpecification::scanConstantSpecification(Scanner* scanner, SpecFile* specFile,
-                                                      int maxApiLevel) {
+                                                      unsigned int maxApiLevel) {
     string name = scanner->getValue();
     VersionInfo info;
     if (!info.scan(scanner, maxApiLevel)) {
@@ -357,7 +367,7 @@
 }
 
 void TypeSpecification::scanTypeSpecification(Scanner* scanner, SpecFile* specFile,
-                                              int maxApiLevel) {
+                                              unsigned int maxApiLevel) {
     string name = scanner->getValue();
     VersionInfo info;
     if (!info.scan(scanner, maxApiLevel)) {
@@ -522,7 +532,7 @@
     }
 }
 
-bool FunctionSpecification::hasTests(int versionOfTestFiles) const {
+bool FunctionSpecification::hasTests(unsigned int versionOfTestFiles) const {
     if (mVersionInfo.maxVersion != 0 && mVersionInfo.maxVersion < versionOfTestFiles) {
         return false;
     }
@@ -533,7 +543,7 @@
 }
 
 void FunctionSpecification::scanFunctionSpecification(Scanner* scanner, SpecFile* specFile,
-                                                      int maxApiLevel) {
+                                                      unsigned int maxApiLevel) {
     // Some functions like convert have # part of the name.  Truncate at that point.
     const string& unexpandedName = scanner->getValue();
     string name = unexpandedName;
@@ -562,6 +572,12 @@
     spec->mTest = "scalar";  // default
     spec->mVersionInfo = info;
 
+    if (scanner->findOptionalTag("internal:")) {
+        spec->mInternal = (scanner->getValue() == "true");
+    }
+    if (scanner->findOptionalTag("intrinsic:")) {
+        spec->mIntrinsic = (scanner->getValue() == "true");
+    }
     if (scanner->findOptionalTag("attrib:")) {
         spec->mAttribute = scanner->getValue();
     }
@@ -711,7 +727,7 @@
 }
 
 // Read the specification, adding the definitions to the global functions map.
-bool SpecFile::readSpecFile(int maxApiLevel) {
+bool SpecFile::readSpecFile(unsigned int maxApiLevel) {
     FILE* specFile = fopen(mSpecFileName.c_str(), "rt");
     if (!specFile) {
         cerr << "Error opening input file: " << mSpecFileName << "\n";
@@ -804,7 +820,7 @@
     return findOrCreate<Function>(name, &mFunctions, created);
 }
 
-bool SystemSpecification::readSpecFile(const string& fileName, int maxApiLevel) {
+bool SystemSpecification::readSpecFile(const string& fileName, unsigned int maxApiLevel) {
     SpecFile* spec = new SpecFile(fileName);
     if (!spec->readSpecFile(maxApiLevel)) {
         cerr << fileName << ": Failed to parse.\n";
@@ -815,12 +831,16 @@
 }
 
 
-static void updateMaxApiLevel(const VersionInfo& info, int* maxApiLevel) {
+static void updateMaxApiLevel(const VersionInfo& info, unsigned int* maxApiLevel) {
+    if (info.minVersion == VersionInfo::kUnreleasedVersion) {
+        // Unreleased (development) APIs do not count toward the maximum API level.
+        return;
+    }
     *maxApiLevel = max(*maxApiLevel, max(info.minVersion, info.maxVersion));
 }
 
-int SystemSpecification::getMaximumApiLevel() {
-    int maxApiLevel = 0;
+unsigned int SystemSpecification::getMaximumApiLevel() {
+    unsigned int maxApiLevel = 0;
     for (auto i : mConstants) {
         for (auto j: i.second->getSpecifications()) {
             updateMaxApiLevel(j->getVersionInfo(), &maxApiLevel);
@@ -839,7 +859,7 @@
     return maxApiLevel;
 }
 
-bool SystemSpecification::generateFiles(bool forVerification, int maxApiLevel) const {
+bool SystemSpecification::generateFiles(bool forVerification, unsigned int maxApiLevel) const {
     bool success = generateHeaderFiles("scriptc") &&
                    generateDocumentation("docs", forVerification) &&
                    generateTestFiles("test", maxApiLevel) &&
diff --git a/api/Specification.h b/api/Specification.h
index 87969a6..d3fbad5 100644
--- a/api/Specification.h
+++ b/api/Specification.h
@@ -19,6 +19,7 @@
 
 // See Generator.cpp for documentation of the .spec file format.
 
+#include <climits>
 #include <fstream>
 #include <list>
 #include <map>
@@ -124,8 +125,8 @@
      * If non zero, both versions should be at least 9, the API level that introduced
      * RenderScript.
      */
-    int minVersion;
-    int maxVersion;
+    unsigned int minVersion;
+    unsigned int maxVersion;
     // Either 0, 32 or 64.  If 0, this definition is valid for both 32 and 64 bits.
     int intSize;
 
@@ -134,12 +135,14 @@
      * we are interested in.  This may alter maxVersion.  This method returns false if the
      * minVersion is greater than the maxApiLevel.
      */
-    bool scan(Scanner* scanner, int maxApiLevel);
+    bool scan(Scanner* scanner, unsigned int maxApiLevel);
     /* Return true if the target can be found whitin the range. */
     bool includesVersion(int target) const {
         return (minVersion == 0 || target >= minVersion) &&
                (maxVersion == 0 || target <= maxVersion);
     }
+
+    static constexpr unsigned int kUnreleasedVersion = UINT_MAX;
 };
 
 // We have three type of definitions
@@ -266,7 +269,7 @@
     std::string getValue() const { return mValue; }
 
     // Parse a constant specification and add it to specFile.
-    static void scanConstantSpecification(Scanner* scanner, SpecFile* specFile, int maxApiLevel);
+    static void scanConstantSpecification(Scanner* scanner, SpecFile* specFile, unsigned int maxApiLevel);
 };
 
 enum TypeKind {
@@ -313,7 +316,7 @@
     const std::vector<std::string>& getValueComments() const { return mValueComments; }
 
     // Parse a type specification and add it to specFile.
-    static void scanTypeSpecification(Scanner* scanner, SpecFile* specFile, int maxApiLevel);
+    static void scanTypeSpecification(Scanner* scanner, SpecFile* specFile, unsigned int maxApiLevel);
 };
 
 // Maximum number of placeholders (like #1, #2) in function specifications.
@@ -343,6 +346,9 @@
      * "": Don't test.  This is the default.
      */
     std::string mTest;
+    bool mInternal;               // Internal. Not visible to users. (Default: false)
+    bool mIntrinsic;              // Compiler intrinsic that is lowered to an internal API.
+                                  // (Default: false)
     std::string mAttribute;       // Function attributes.
     std::string mPrecisionLimit;  // Maximum precision required when checking output of this
                                   // function.
@@ -379,10 +385,13 @@
     void createPermutations(Function* function, Scanner* scanner);
 
 public:
-    FunctionSpecification(Function* function) : mFunction(function), mReturn(nullptr) {}
+    FunctionSpecification(Function* function) : mFunction(function), mInternal(false),
+        mIntrinsic(false), mReturn(nullptr) {}
     ~FunctionSpecification();
 
     Function* getFunction() const { return mFunction; }
+    bool isInternal() const { return mInternal; }
+    bool isIntrinsic() const { return mIntrinsic; }
     std::string getAttribute() const { return mAttribute; }
     std::string getTest() const { return mTest; }
     std::string getPrecisionLimit() const { return mPrecisionLimit; }
@@ -402,7 +411,7 @@
     void parseTest(Scanner* scanner);
 
     // Return true if we need to generate tests for this function.
-    bool hasTests(int versionOfTestFiles) const;
+    bool hasTests(unsigned int versionOfTestFiles) const;
 
     bool hasInline() const { return mInline.size() > 0; }
 
@@ -415,7 +424,7 @@
     }
 
     // Parse a function specification and add it to specFile.
-    static void scanFunctionSpecification(Scanner* scanner, SpecFile* specFile, int maxApiLevel);
+    static void scanFunctionSpecification(Scanner* scanner, SpecFile* specFile, unsigned int maxApiLevel);
 };
 
 /* A concrete version of a function specification, where all placeholders have been replaced by
@@ -527,7 +536,7 @@
                !mDocumentedFunctions.empty();
     }
 
-    bool readSpecFile(int maxApiLevel);
+    bool readSpecFile(unsigned int maxApiLevel);
 
     /* These are called by the parser to keep track of the specifications defined in this file.
      * hasDocumentation is true if this specification containes the documentation.
@@ -562,9 +571,9 @@
     /* Parse the spec file and create the object hierarchy, adding a pointer to mSpecFiles.
      * We won't include information passed the specified level.
      */
-    bool readSpecFile(const std::string& fileName, int maxApiLevel);
+    bool readSpecFile(const std::string& fileName, unsigned int maxApiLevel);
     // Generate all the files.
-    bool generateFiles(bool forVerification, int maxApiLevel) const;
+    bool generateFiles(bool forVerification, unsigned int maxApiLevel) const;
 
     const std::vector<SpecFile*>& getSpecFiles() const { return mSpecFiles; }
     const std::map<std::string, Constant*>& getConstants() const { return mConstants; }
@@ -575,7 +584,7 @@
     std::string getHtmlAnchor(const std::string& name) const;
 
     // Returns the maximum API level specified in any spec file.
-    int getMaximumApiLevel();
+    unsigned int getMaximumApiLevel();
 };
 
 // Singleton that represents the collection of all the specs we're processing.
diff --git a/api/Utilities.cpp b/api/Utilities.cpp
index 4268278..841d824 100644
--- a/api/Utilities.cpp
+++ b/api/Utilities.cpp
@@ -163,7 +163,7 @@
 }
 
 string makeAttributeTag(const string& userAttribute, const string& additionalAttribute,
-                        int deprecatedApiLevel, const string& deprecatedMessage) {
+                        unsigned int deprecatedApiLevel, const string& deprecatedMessage) {
     ostringstream stream;
     bool needComma = false;
     if (userAttribute[0] == '=') {
diff --git a/api/Utilities.h b/api/Utilities.h
index cd0db72..eced68d 100644
--- a/api/Utilities.h
+++ b/api/Utilities.h
@@ -51,7 +51,7 @@
  * use the additionalAttribute.  An empty string will be returned if there are no attributes.
  */
 std::string makeAttributeTag(const std::string& userAttribute,
-                             const std::string& additionalAttribute, int deprecatedApiLevel,
+                             const std::string& additionalAttribute, unsigned int deprecatedApiLevel,
                              const std::string& deprecatedMessage);
 
 /* This class is used to generate one source file.  There will be one instance
diff --git a/api/generate.sh b/api/generate.sh
index 3ff882f..55d3f04 100755
--- a/api/generate.sh
+++ b/api/generate.sh
@@ -50,7 +50,7 @@
   do
     mv slangtest/all$i.rs ../../compile/slang/tests/P_all_api_$i
 done
-rmdir slangtest
+rm -rf slangtest
 
 mv RSStubsWhiteList.cpp ../../compile/libbcc/lib/Renderscript/
 
diff --git a/api/rs_for_each.spec b/api/rs_for_each.spec
index c0c09b2..c9bb2e6 100644
--- a/api/rs_for_each.spec
+++ b/api/rs_for_each.spec
@@ -83,6 +83,15 @@
  over cells 4, 5, 6, and 7 in the X dimension, set xStart to 4 and xEnd to 8.
 end:
 
+type: rs_kernel
+version: UNRELEASED
+simple: void*
+summary: Handle to a kernel function
+description:
+  An opaque type for a function that is defined with the kernel attribute.  A value
+  of this type can be used in a @rsForEach call to launch a kernel.
+end:
+
 function: rsForEach
 version: 9 13
 ret: void
@@ -91,26 +100,34 @@
 arg: rs_allocation output, "Allocation to write date into."
 arg: const void* usrData, "User defined data to pass to the script.  May be NULL."
 arg: const rs_script_call_t* sc, "Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL."
-summary: Invoke the root kernel of a script
+summary: Launches a kernel
 description:
- Invoke the kernel named "root" of the specified script.  Like other kernels, this root()
- function will be invoked repeatedly over the cells of the specificed allocation, filling
- the output allocation with the results.
+ Runs the kernel over zero or more input allocations. They are passed after the
+ @rs_kernel argument. If the specified kernel returns a value, an output allocation
+ must be specified as the last argument. All input allocations,
+ and the output allocation if it exists, must have the same dimensions.
 
- When rsForEach is called, the root script is launched immediately.  rsForEach returns
- only when the script has completed and the output allocation is ready to use.
+ This is a synchronous function. A call to this function only returns after all
+ the work has completed for all cells of the input allocations. If the kernel
+ function returns any value, the call waits until all results have been written
+ to the output allocation.
 
- The rs_script argument is typically initialized using a global variable set from Java.
+ Up to API level 23, the kernel is implicitly specified as the kernel named
+ "root" in the specified script, and only a single input allocation can be used.
+ Starting in API level *UNRELEASED*, an arbitrary kernel function can be used,
+ as specified by the kernel argument. The script argument is removed.
+ The kernel must be defined in the current script. In addition, more than one
+ input can be used.
 
- The kernel can be invoked with just an input allocation or just an output allocation.
- This can be done by defining an rs_allocation variable and not initializing it.  E.g.<code><br/>
- rs_script gCustomScript;<br/>
- void specializedProcessing(rs_allocation in) {<br/>
- &nbsp;&nbsp;rs_allocation ignoredOut;<br/>
- &nbsp;&nbsp;rsForEach(gCustomScript, in, ignoredOut);<br/>
- }<br/></code>
-
- If both input and output allocations are specified, they must have the same dimensions.
+E.g.<code><br/>
+ float __attribute__((kernel)) square(float a) {<br/>
+ &nbsp;&nbsp;return a * a;<br/>
+ }<br/>
+<br/>
+ void compute(rs_allocation ain, rs_allocation aout) {<br/>
+ &nbsp;&nbsp;rsForEach(square, ain, aout);<br/>
+ }<br/>
+<br/></code>
 test: none
 end:
 
@@ -148,7 +165,7 @@
 end:
 
 function: rsForEach
-version: 14
+version: 14 23
 ret: void
 arg: rs_script script
 arg: rs_allocation input
@@ -156,6 +173,59 @@
 test: none
 end:
 
+function: rsForEach
+version: UNRELEASED
+intrinsic: true
+attrib: =  # Not overloadable
+ret: void
+arg: rs_kernel kernel, "Function designator to a function that is defined with the kernel attribute."
+arg: ..., "Input and output allocations"
+test: none
+end:
+
+function: rsForEachWithOptions
+version: UNRELEASED
+intrinsic: true
+attrib: =  # Not overloadable
+ret: void
+arg: rs_kernel kernel, "Function designator to a function that is defined with the kernel attribute."
+arg: rs_script_call_t* options, "Launch options"
+arg: ..., "Input and output allocations"
+summary: Launches a kernel with options
+description:
+ Launches a kernel in a way similar to @rsForEach. However, instead of processing
+ all cells in the input, this function only processes cells in the subspace of
+ the index space specified in options. With the index space explicitly specified
+ by options, no input or output allocation is required for a kernel launch using
+ this API. If allocations are passed in, they must match the number of arguments
+ and return value expected by the kernel function. The output allocation is
+ present if and only if the kernel has a non-void return value.
+
+ E.g., <code><br/>
+    rs_script_call_t opts = {0};<br/>
+    opts.xStart = 0;<br/>
+    opts.xEnd = dimX;<br/>
+    opts.yStart = 0;<br/>
+    opts.yEnd = dimY / 2;<br/>
+    rsForEachWithOptions(foo, &opts, out, out);<br/>
+</code>
+
+test: none
+end:
+
+function: rsForEachInternal
+version: UNRELEASED
+internal: true
+ret: void
+arg: int slot
+arg: rs_script_call_t* options
+arg: rs_allocation input
+arg: rs_allocation output
+summary: (Internal API) Launch a kernel in the current Script (with the slot number)
+description:
+test: none
+end:
+
 function: rsGetArray0
 version: 23
 ret: uint32_t
diff --git a/cpp/rsDispatch.cpp b/cpp/rsDispatch.cpp
index fd09c87..ce147d9 100644
--- a/cpp/rsDispatch.cpp
+++ b/cpp/rsDispatch.cpp
@@ -20,7 +20,7 @@
 #include <dlfcn.h>
 #include <limits.h>
 
-#define LOG_API(...)
+#define LOG_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "RS Dispatch", __VA_ARGS__)
 #define REDUCE_API_LEVEL INT_MAX
 
 bool loadSymbols(void* handle, dispatchTable& dispatchTab, int device_api) {
@@ -28,340 +28,340 @@
     // Function to set the native lib path for 64bit compat lib.
     dispatchTab.SetNativeLibDir = (SetNativeLibDirFnPtr)dlsym(handle, "rsaContextSetNativeLibDir");
     if (dispatchTab.SetNativeLibDir == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.SetNativeLibDir");
+        LOG_ERR("Couldn't initialize dispatchTab.SetNativeLibDir");
         return false;
     }
 #endif
     dispatchTab.AllocationGetType = (AllocationGetTypeFnPtr)dlsym(handle, "rsaAllocationGetType");
     if (dispatchTab.AllocationGetType == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationGetType");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationGetType");
         return false;
     }
     dispatchTab.TypeGetNativeData = (TypeGetNativeDataFnPtr)dlsym(handle, "rsaTypeGetNativeData");
     if (dispatchTab.TypeGetNativeData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.TypeGetNativeData");
+        LOG_ERR("Couldn't initialize dispatchTab.TypeGetNativeData");
         return false;
     }
     dispatchTab.ElementGetNativeData = (ElementGetNativeDataFnPtr)dlsym(handle, "rsaElementGetNativeData");
     if (dispatchTab.ElementGetNativeData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ElementGetNativeData");
+        LOG_ERR("Couldn't initialize dispatchTab.ElementGetNativeData");
         return false;
     }
     dispatchTab.ElementGetSubElements = (ElementGetSubElementsFnPtr)dlsym(handle, "rsaElementGetSubElements");
     if (dispatchTab.ElementGetSubElements == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ElementGetSubElements");
+        LOG_ERR("Couldn't initialize dispatchTab.ElementGetSubElements");
         return false;
     }
     dispatchTab.DeviceCreate = (DeviceCreateFnPtr)dlsym(handle, "rsDeviceCreate");
     if (dispatchTab.DeviceCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.DeviceCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.DeviceCreate");
         return false;
     }
     dispatchTab.DeviceDestroy = (DeviceDestroyFnPtr)dlsym(handle, "rsDeviceDestroy");
     if (dispatchTab.DeviceDestroy == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.DeviceDestroy");
+        LOG_ERR("Couldn't initialize dispatchTab.DeviceDestroy");
         return false;
     }
     dispatchTab.DeviceSetConfig = (DeviceSetConfigFnPtr)dlsym(handle, "rsDeviceSetConfig");
     if (dispatchTab.DeviceSetConfig == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.DeviceSetConfig");
+        LOG_ERR("Couldn't initialize dispatchTab.DeviceSetConfig");
         return false;
     }
     dispatchTab.ContextCreate = (ContextCreateFnPtr)dlsym(handle, "rsContextCreate");;
     if (dispatchTab.ContextCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextCreate");
         return false;
     }
     dispatchTab.GetName = (GetNameFnPtr)dlsym(handle, "rsaGetName");;
     if (dispatchTab.GetName == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.GetName");
+        LOG_ERR("Couldn't initialize dispatchTab.GetName");
         return false;
     }
     dispatchTab.ContextDestroy = (ContextDestroyFnPtr)dlsym(handle, "rsContextDestroy");
     if (dispatchTab.ContextDestroy == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextDestroy");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextDestroy");
         return false;
     }
     dispatchTab.ContextGetMessage = (ContextGetMessageFnPtr)dlsym(handle, "rsContextGetMessage");
     if (dispatchTab.ContextGetMessage == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextGetMessage");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextGetMessage");
         return false;
     }
     dispatchTab.ContextPeekMessage = (ContextPeekMessageFnPtr)dlsym(handle, "rsContextPeekMessage");
     if (dispatchTab.ContextPeekMessage == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextPeekMessage");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextPeekMessage");
         return false;
     }
     dispatchTab.ContextSendMessage = (ContextSendMessageFnPtr)dlsym(handle, "rsContextSendMessage");
     if (dispatchTab.ContextSendMessage == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextSendMessage");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextSendMessage");
         return false;
     }
     dispatchTab.ContextInitToClient = (ContextInitToClientFnPtr)dlsym(handle, "rsContextInitToClient");
     if (dispatchTab.ContextInitToClient == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextInitToClient");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextInitToClient");
         return false;
     }
     dispatchTab.ContextDeinitToClient = (ContextDeinitToClientFnPtr)dlsym(handle, "rsContextDeinitToClient");
     if (dispatchTab.ContextDeinitToClient == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextDeinitToClient");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextDeinitToClient");
         return false;
     }
     dispatchTab.TypeCreate = (TypeCreateFnPtr)dlsym(handle, "rsTypeCreate");
     if (dispatchTab.TypeCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.TypeCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.TypeCreate");
         return false;
     }
     dispatchTab.AllocationCreateTyped = (AllocationCreateTypedFnPtr)dlsym(handle, "rsAllocationCreateTyped");
     if (dispatchTab.AllocationCreateTyped == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCreateTyped");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCreateTyped");
         return false;
     }
     dispatchTab.AllocationCreateFromBitmap = (AllocationCreateFromBitmapFnPtr)dlsym(handle, "rsAllocationCreateFromBitmap");
     if (dispatchTab.AllocationCreateFromBitmap == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCreateFromBitmap");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCreateFromBitmap");
         return false;
     }
     dispatchTab.AllocationCubeCreateFromBitmap = (AllocationCubeCreateFromBitmapFnPtr)dlsym(handle, "rsAllocationCubeCreateFromBitmap");
     if (dispatchTab.AllocationCubeCreateFromBitmap == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCubeCreateFromBitmap");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCubeCreateFromBitmap");
         return false;
     }
     dispatchTab.AllocationGetSurface = (AllocationGetSurfaceFnPtr)dlsym(handle, "rsAllocationGetSurface");
     if (dispatchTab.AllocationGetSurface == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationGetSurface");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationGetSurface");
         return false;
     }
     dispatchTab.AllocationSetSurface = (AllocationSetSurfaceFnPtr)dlsym(handle, "rsAllocationSetSurface");
     if (dispatchTab.AllocationSetSurface == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationSetSurface");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationSetSurface");
         return false;
     }
     dispatchTab.ContextFinish = (ContextFinishFnPtr)dlsym(handle, "rsContextFinish");
     if (dispatchTab.ContextFinish == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextFinish");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextFinish");
         return false;
     }
     dispatchTab.ContextDump = (ContextDumpFnPtr)dlsym(handle, "rsContextDump");
     if (dispatchTab.ContextDump == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextDump");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextDump");
         return false;
     }
     dispatchTab.ContextSetPriority = (ContextSetPriorityFnPtr)dlsym(handle, "rsContextSetPriority");
     if (dispatchTab.ContextSetPriority == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ContextSetPriority");
+        LOG_ERR("Couldn't initialize dispatchTab.ContextSetPriority");
         return false;
     }
     dispatchTab.AssignName = (AssignNameFnPtr)dlsym(handle, "rsAssignName");
     if (dispatchTab.AssignName == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AssignName");
+        LOG_ERR("Couldn't initialize dispatchTab.AssignName");
         return false;
     }
     dispatchTab.ObjDestroy = (ObjDestroyFnPtr)dlsym(handle, "rsObjDestroy");
     if (dispatchTab.ObjDestroy == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ObjDestroy");
+        LOG_ERR("Couldn't initialize dispatchTab.ObjDestroy");
         return false;
     }
     dispatchTab.ElementCreate = (ElementCreateFnPtr)dlsym(handle, "rsElementCreate");
     if (dispatchTab.ElementCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ElementCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ElementCreate");
         return false;
     }
     dispatchTab.ElementCreate2 = (ElementCreate2FnPtr)dlsym(handle, "rsElementCreate2");
     if (dispatchTab.ElementCreate2 == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ElementCreate2");
+        LOG_ERR("Couldn't initialize dispatchTab.ElementCreate2");
         return false;
     }
     dispatchTab.AllocationCopyToBitmap = (AllocationCopyToBitmapFnPtr)dlsym(handle, "rsAllocationCopyToBitmap");
     if (dispatchTab.AllocationCopyToBitmap == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCopyToBitmap");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCopyToBitmap");
         return false;
     }
     dispatchTab.Allocation1DData = (Allocation1DDataFnPtr)dlsym(handle, "rsAllocation1DData");
     if (dispatchTab.Allocation1DData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation1DData");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation1DData");
         return false;
     }
     dispatchTab.Allocation1DElementData = (Allocation1DElementDataFnPtr)dlsym(handle, "rsAllocation1DElementData");
     if (dispatchTab.Allocation1DElementData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation1DElementData");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation1DElementData");
         return false;
     }
     dispatchTab.Allocation2DData = (Allocation2DDataFnPtr)dlsym(handle, "rsAllocation2DData");
     if (dispatchTab.Allocation2DData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation2DData");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation2DData");
         return false;
     }
     dispatchTab.Allocation3DData = (Allocation3DDataFnPtr)dlsym(handle, "rsAllocation3DData");
     if (dispatchTab.Allocation3DData == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation3DData");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation3DData");
         return false;
     }
     dispatchTab.AllocationGenerateMipmaps = (AllocationGenerateMipmapsFnPtr)dlsym(handle, "rsAllocationGenerateMipmaps");
     if (dispatchTab.AllocationGenerateMipmaps == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationGenerateMipmaps");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationGenerateMipmaps");
         return false;
     }
     dispatchTab.AllocationRead = (AllocationReadFnPtr)dlsym(handle, "rsAllocationRead");
     if (dispatchTab.AllocationRead == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationRead");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationRead");
         return false;
     }
     dispatchTab.Allocation1DRead = (Allocation1DReadFnPtr)dlsym(handle, "rsAllocation1DRead");
     if (dispatchTab.Allocation1DRead == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation1DRead");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation1DRead");
         return false;
     }
     dispatchTab.Allocation2DRead = (Allocation2DReadFnPtr)dlsym(handle, "rsAllocation2DRead");
     if (dispatchTab.Allocation2DRead == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.Allocation2DRead");
+        LOG_ERR("Couldn't initialize dispatchTab.Allocation2DRead");
         return false;
     }
     dispatchTab.AllocationSyncAll = (AllocationSyncAllFnPtr)dlsym(handle, "rsAllocationSyncAll");
     if (dispatchTab.AllocationSyncAll == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationSyncAll");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationSyncAll");
         return false;
     }
     dispatchTab.AllocationResize1D = (AllocationResize1DFnPtr)dlsym(handle, "rsAllocationResize1D");
     if (dispatchTab.AllocationResize1D == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationResize1D");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationResize1D");
         return false;
     }
     dispatchTab.AllocationCopy2DRange = (AllocationCopy2DRangeFnPtr)dlsym(handle, "rsAllocationCopy2DRange");
     if (dispatchTab.AllocationCopy2DRange == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCopy2DRange");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCopy2DRange");
         return false;
     }
     dispatchTab.AllocationCopy3DRange = (AllocationCopy3DRangeFnPtr)dlsym(handle, "rsAllocationCopy3DRange");
     if (dispatchTab.AllocationCopy3DRange == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationCopy3DRange");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationCopy3DRange");
         return false;
     }
     dispatchTab.SamplerCreate = (SamplerCreateFnPtr)dlsym(handle, "rsSamplerCreate");
     if (dispatchTab.SamplerCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.SamplerCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.SamplerCreate");
         return false;
     }
     dispatchTab.ScriptBindAllocation = (ScriptBindAllocationFnPtr)dlsym(handle, "rsScriptBindAllocation");
     if (dispatchTab.ScriptBindAllocation == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptBindAllocation");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptBindAllocation");
         return false;
     }
     dispatchTab.ScriptSetTimeZone = (ScriptSetTimeZoneFnPtr)dlsym(handle, "rsScriptSetTimeZone");
     if (dispatchTab.ScriptSetTimeZone == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetTimeZone");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetTimeZone");
         return false;
     }
     dispatchTab.ScriptInvoke = (ScriptInvokeFnPtr)dlsym(handle, "rsScriptInvoke");
     if (dispatchTab.ScriptInvoke == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptInvoke");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptInvoke");
         return false;
     }
     dispatchTab.ScriptInvokeV = (ScriptInvokeVFnPtr)dlsym(handle, "rsScriptInvokeV");
     if (dispatchTab.ScriptInvokeV == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptInvokeV");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptInvokeV");
         return false;
     }
     dispatchTab.ScriptForEach = (ScriptForEachFnPtr)dlsym(handle, "rsScriptForEach");
     if (dispatchTab.ScriptForEach == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptForEach");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptForEach");
         return false;
     }
     dispatchTab.ScriptSetVarI = (ScriptSetVarIFnPtr)dlsym(handle, "rsScriptSetVarI");
     if (dispatchTab.ScriptSetVarI == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarI");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarI");
         return false;
     }
     dispatchTab.ScriptSetVarObj = (ScriptSetVarObjFnPtr)dlsym(handle, "rsScriptSetVarObj");
     if (dispatchTab.ScriptSetVarObj == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarObj");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarObj");
         return false;
     }
     dispatchTab.ScriptSetVarJ = (ScriptSetVarJFnPtr)dlsym(handle, "rsScriptSetVarJ");
     if (dispatchTab.ScriptSetVarJ == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarJ");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarJ");
         return false;
     }
     dispatchTab.ScriptSetVarF = (ScriptSetVarFFnPtr)dlsym(handle, "rsScriptSetVarF");
     if (dispatchTab.ScriptSetVarF == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarF");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarF");
         return false;
     }
     dispatchTab.ScriptSetVarD = (ScriptSetVarDFnPtr)dlsym(handle, "rsScriptSetVarD");
     if (dispatchTab.ScriptSetVarD == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarD");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarD");
         return false;
     }
     dispatchTab.ScriptSetVarV = (ScriptSetVarVFnPtr)dlsym(handle, "rsScriptSetVarV");
     if (dispatchTab.ScriptSetVarV == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarV");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarV");
         return false;
     }
     dispatchTab.ScriptGetVarV = (ScriptGetVarVFnPtr)dlsym(handle, "rsScriptGetVarV");
     if (dispatchTab.ScriptGetVarV == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptGetVarV");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptGetVarV");
         return false;
     }
     dispatchTab.ScriptSetVarVE = (ScriptSetVarVEFnPtr)dlsym(handle, "rsScriptSetVarVE");
     if (dispatchTab.ScriptSetVarVE == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptSetVarVE");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptSetVarVE");
         return false;
     }
     dispatchTab.ScriptCCreate = (ScriptCCreateFnPtr)dlsym(handle, "rsScriptCCreate");
     if (dispatchTab.ScriptCCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptCCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptCCreate");
         return false;
     }
     dispatchTab.ScriptIntrinsicCreate = (ScriptIntrinsicCreateFnPtr)dlsym(handle, "rsScriptIntrinsicCreate");
     if (dispatchTab.ScriptIntrinsicCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptIntrinsicCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptIntrinsicCreate");
         return false;
     }
     dispatchTab.ScriptKernelIDCreate = (ScriptKernelIDCreateFnPtr)dlsym(handle, "rsScriptKernelIDCreate");
     if (dispatchTab.ScriptKernelIDCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptKernelIDCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptKernelIDCreate");
         return false;
     }
     dispatchTab.ScriptFieldIDCreate = (ScriptFieldIDCreateFnPtr)dlsym(handle, "rsScriptFieldIDCreate");
     if (dispatchTab.ScriptFieldIDCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptFieldIDCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptFieldIDCreate");
         return false;
     }
     dispatchTab.ScriptGroupCreate = (ScriptGroupCreateFnPtr)dlsym(handle, "rsScriptGroupCreate");
     if (dispatchTab.ScriptGroupCreate == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptGroupCreate");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptGroupCreate");
         return false;
     }
     dispatchTab.ScriptGroupSetOutput = (ScriptGroupSetOutputFnPtr)dlsym(handle, "rsScriptGroupSetOutput");
     if (dispatchTab.ScriptGroupSetOutput == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptGroupSetOutput");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptGroupSetOutput");
         return false;
     }
     dispatchTab.ScriptGroupSetInput = (ScriptGroupSetInputFnPtr)dlsym(handle, "rsScriptGroupSetInput");
     if (dispatchTab.ScriptGroupSetInput == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptGroupSetInput");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptGroupSetInput");
         return false;
     }
     dispatchTab.ScriptGroupExecute = (ScriptGroupExecuteFnPtr)dlsym(handle, "rsScriptGroupExecute");
     if (dispatchTab.ScriptGroupExecute == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.ScriptGroupExecute");
+        LOG_ERR("Couldn't initialize dispatchTab.ScriptGroupExecute");
         return false;
     }
     dispatchTab.AllocationIoSend = (AllocationIoSendFnPtr)dlsym(handle, "rsAllocationIoSend");
     if (dispatchTab.AllocationIoSend == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationIoSend");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationIoSend");
         return false;
     }
     dispatchTab.AllocationIoReceive = (AllocationIoReceiveFnPtr)dlsym(handle, "rsAllocationIoReceive");
     if (dispatchTab.AllocationIoReceive == NULL) {
-        LOG_API("Couldn't initialize dispatchTab.AllocationIoReceive");
+        LOG_ERR("Couldn't initialize dispatchTab.AllocationIoReceive");
         return false;
     }
     // API_21 functions
     if (device_api >= 21) {
         dispatchTab.AllocationGetPointer = (AllocationGetPointerFnPtr)dlsym(handle, "rsAllocationGetPointer");
         if (dispatchTab.AllocationGetPointer == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.AllocationGetPointer");
+            LOG_ERR("Couldn't initialize dispatchTab.AllocationGetPointer");
             return false;
         }
     }
@@ -370,52 +370,52 @@
         // ScriptGroup V2 functions
         dispatchTab.ScriptInvokeIDCreate = (ScriptInvokeIDCreateFnPtr)dlsym(handle, "rsScriptInvokeIDCreate");
         if (dispatchTab.ScriptInvokeIDCreate == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ScriptInvokeIDCreate");
+            LOG_ERR("Couldn't initialize dispatchTab.ScriptInvokeIDCreate");
             return false;
         }
         dispatchTab.ClosureCreate = (ClosureCreateFnPtr)dlsym(handle, "rsClosureCreate");
         if (dispatchTab.ClosureCreate == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ClosureCreate");
+            LOG_ERR("Couldn't initialize dispatchTab.ClosureCreate");
             return false;
         }
         dispatchTab.InvokeClosureCreate = (InvokeClosureCreateFnPtr)dlsym(handle, "rsInvokeClosureCreate");
         if (dispatchTab.InvokeClosureCreate == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.InvokeClosureCreate");
+            LOG_ERR("Couldn't initialize dispatchTab.InvokeClosureCreate");
             return false;
         }
         dispatchTab.ClosureSetArg = (ClosureSetArgFnPtr)dlsym(handle, "rsClosureSetArg");
         if (dispatchTab.ClosureSetArg == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ClosureSetArg");
+            LOG_ERR("Couldn't initialize dispatchTab.ClosureSetArg");
             return false;
         }
         dispatchTab.ClosureSetGlobal = (ClosureSetGlobalFnPtr)dlsym(handle, "rsClosureSetGlobal");
         if (dispatchTab.ClosureSetGlobal == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ClosureSetGlobal");
+            LOG_ERR("Couldn't initialize dispatchTab.ClosureSetGlobal");
             return false;
         }
         dispatchTab.ScriptGroup2Create = (ScriptGroup2CreateFnPtr)dlsym(handle, "rsScriptGroup2Create");
         if (dispatchTab.ScriptGroup2Create == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ScriptGroup2Create");
+            LOG_ERR("Couldn't initialize dispatchTab.ScriptGroup2Create");
             return false;
         }
         dispatchTab.AllocationElementData = (AllocationElementDataFnPtr)dlsym(handle, "rsAllocationElementData");
         if (dispatchTab.AllocationElementData == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.AllocationElementData");
+            LOG_ERR("Couldn't initialize dispatchTab.AllocationElementData");
             return false;
         }
         dispatchTab.AllocationElementRead = (AllocationElementReadFnPtr)dlsym(handle, "rsAllocationElementRead");
         if (dispatchTab.AllocationElementRead == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.AllocationElementRead");
+            LOG_ERR("Couldn't initialize dispatchTab.AllocationElementRead");
             return false;
         }
         dispatchTab.Allocation3DRead = (Allocation3DReadFnPtr)dlsym(handle, "rsAllocation3DRead");
         if (dispatchTab.Allocation3DRead == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.Allocation3DRead");
+            LOG_ERR("Couldn't initialize dispatchTab.Allocation3DRead");
             return false;
         }
         dispatchTab.ScriptForEachMulti = (ScriptForEachMultiFnPtr)dlsym(handle, "rsScriptForEachMulti");
         if (dispatchTab.ScriptForEachMulti == NULL) {
-            LOG_API("Couldn't initialize dispatchTab.ScriptForEachMulti");
+            LOG_ERR("Couldn't initialize dispatchTab.ScriptForEachMulti");
             return false;
         }
     }
@@ -423,7 +423,7 @@
     if (device_api >= REDUCE_API_LEVEL) {
         dispatchTab.ScriptReduce = (ScriptReduceFnPtr)dlsym(handle, "rsScriptReduce");
         if (dispatchTab.ScriptReduce == nullptr) {
-            LOG_API("Couldn't initialize dispatchTab.ScriptReduce");
+            LOG_ERR("Couldn't initialize dispatchTab.ScriptReduce");
             return false;
         }
     }
@@ -436,7 +436,7 @@
 bool loadIOSuppSyms(void* handleIO, ioSuppDT& ioDispatch){
     ioDispatch.sAllocationSetSurface = (sAllocationSetSurfaceFnPtr)dlsym(handleIO, "AllocationSetSurface");
     if (ioDispatch.sAllocationSetSurface == NULL) {
-        LOG_API("Couldn't initialize ioDispatch.sAllocationSetSurface");
+        LOG_ERR("Couldn't initialize ioDispatch.sAllocationSetSurface");
         return false;
     }
     return true;
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index c2f565c..c816c7d 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -53,9 +53,6 @@
     rsCpuIntrinsics_advsimd_YuvToRGB.S
 #    rsCpuIntrinsics_advsimd_Blend.S \
 
-# Clang does not support nested .irp in *_Blur.S
-LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
-
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_NEON
 endif
@@ -72,8 +69,6 @@
     rsCpuIntrinsics_neon_YuvToRGB.S \
 
     LOCAL_ASFLAGS_arm := -mfpu=neon
-    # Clang does not support nested .irp in *_Blur.S
-    LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
 endif
 
 ifeq ($(ARCH_X86_HAVE_SSSE3),true)
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 7ea80a0..f73290f 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -18,6 +18,8 @@
 #define PRIVATE(f) .text; .align 4; .type f,#function; f:
 #define END(f) .size f, .-f;
 
+//#define ARCH_ARM64_USE_BLUR_PRELOAD
+
 .set FRACTION_BITS, 7
 .set MAX_R, 25
 
@@ -32,6 +34,15 @@
 .endif
 .endm
 
+/* It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here.
+ */
+#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
+#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
 /* Fetch 16 columns of bytes (regardless of image format), convolve these
  * vertically, and leave them in the register file.  If working near the top or
  * bottom of an image then clamp the addressing while loading the data in.
@@ -71,7 +82,7 @@
             mov         x10, x15
 
             uxtl        v14.8h, v15.8b
-//            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
+            VERTPLD(x1, #16)
             uxtl2       v15.8h, v15.16b
   .if \max_r < 16 // approximate
     ifcc    adr         \reg, 1f
@@ -89,40 +100,128 @@
             umull2      v15.4s, v15.8h, v0.h[0]
             br          \reg
 
-  .irp rowclamp, 1, 0
-    .set cc, \rowclamp
-    .align 4
-    // clang does not support nested .irp
-    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
-        .set i, \dreg * 8 + \lane
-        .if 0 < i && i <= \max_r
+  /* This version of the vertical fetch loop body is used away from the edges
+   * of the source image.  The pointers start at the top and bottom source rows
+   * and work their way towards the centre on each iteration.  This way the
+   * number of taps used can be controlled by jumping directly into the middle
+   * of the loop and running to completion.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_noclamp i, dreg
+    .if 0 < \i && \i <= \max_r
             ld1         {v10.16b}, [x10], x2
-    ifcc    cmp         x6, #i
             ld1         {v11.16b}, [x11], x13
-    ifcc    csel        x10, x15, x10, lo
             uaddl       v16.8h, v10.8b, v11.8b
-    ifcc    cmp         x7, #i
             uaddl2      v11.8h, v10.16b, v11.16b
-    ifcc    csel        x11, x19, x11, lo
-            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
-            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
-//            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
-nop
-            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
-//            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
-nop
-            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
-        .endif
-    .endr ; .endr ; .endr
-    .if \rowclamp == 1
-        1: \labelc :
-            b           2f
-    .else
-        2: \labelnc :
+            umlal       v12.4s, v16.4h, \dreg
+            umlal2      v13.4s, v16.8h, \dreg
+            VERTPLD(x10, #32)
+            umlal       v14.4s, v11.4h, \dreg
+            VERTPLD(x11, #32)
+            umlal2      v15.4s, v11.8h, \dreg
     .endif
-  .endr
+  .endm
 
-            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
+  /* This version of the vertical fetch loop body is used near the edges of the
+   * source image, where one or both of the accesses may start with a clamped
+   * value, and the row addresses only begin to change after some number of
+   * iterations before the end.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_clamped i, dreg
+    .if 0 < \i && \i <= \max_r
+            ld1         {v10.16b}, [x10], x2
+            cmp         x6, #\i
+            ld1         {v11.16b}, [x11], x13
+            csel        x10, x15, x10, lo
+            uaddl       v16.8h, v10.8b, v11.8b
+            cmp         x7, #\i
+            uaddl2      v11.8h, v10.16b, v11.16b
+            csel        x11, x19, x11, lo
+            umlal       v12.4s, v16.4h, \dreg
+            umlal2      v13.4s, v16.8h, \dreg
+            VERTPLD(x10, #32)
+            umlal       v14.4s, v11.4h, \dreg
+            VERTPLD(x11, #32)
+            umlal2      v15.4s, v11.8h, \dreg
+    .endif
+  .endm
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelc at the end of the block.
+   */
+  .align 4
+  vertfetch_clamped 27, v3.h[3]
+  vertfetch_clamped 26, v3.h[2]
+  vertfetch_clamped 25, v3.h[1]
+  vertfetch_clamped 24, v3.h[0]
+  vertfetch_clamped 23, v2.h[7]
+  vertfetch_clamped 22, v2.h[6]
+  vertfetch_clamped 21, v2.h[5]
+  vertfetch_clamped 20, v2.h[4]
+  vertfetch_clamped 19, v2.h[3]
+  vertfetch_clamped 18, v2.h[2]
+  vertfetch_clamped 17, v2.h[1]
+  vertfetch_clamped 16, v2.h[0]
+  vertfetch_clamped 15, v1.h[7]
+  vertfetch_clamped 14, v1.h[6]
+  vertfetch_clamped 13, v1.h[5]
+  vertfetch_clamped 12, v1.h[4]
+  vertfetch_clamped 11, v1.h[3]
+  vertfetch_clamped 10, v1.h[2]
+  vertfetch_clamped  9, v1.h[1]
+  vertfetch_clamped  8, v1.h[0]
+  vertfetch_clamped  7, v0.h[7]
+  vertfetch_clamped  6, v0.h[6]
+  vertfetch_clamped  5, v0.h[5]
+  vertfetch_clamped  4, v0.h[4]
+  vertfetch_clamped  3, v0.h[3]
+  vertfetch_clamped  2, v0.h[2]
+  vertfetch_clamped  1, v0.h[1]
+  vertfetch_clamped  0, v0.h[0]
+  1:
+  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelnc at the end of the block.
+   */
+  .align 4
+  vertfetch_noclamp 27, v3.h[3]
+  vertfetch_noclamp 26, v3.h[2]
+  vertfetch_noclamp 25, v3.h[1]
+  vertfetch_noclamp 24, v3.h[0]
+  vertfetch_noclamp 23, v2.h[7]
+  vertfetch_noclamp 22, v2.h[6]
+  vertfetch_noclamp 21, v2.h[5]
+  vertfetch_noclamp 20, v2.h[4]
+  vertfetch_noclamp 19, v2.h[3]
+  vertfetch_noclamp 18, v2.h[2]
+  vertfetch_noclamp 17, v2.h[1]
+  vertfetch_noclamp 16, v2.h[0]
+  vertfetch_noclamp 15, v1.h[7]
+  vertfetch_noclamp 14, v1.h[6]
+  vertfetch_noclamp 13, v1.h[5]
+  vertfetch_noclamp 12, v1.h[4]
+  vertfetch_noclamp 11, v1.h[3]
+  vertfetch_noclamp 10, v1.h[2]
+  vertfetch_noclamp  9, v1.h[1]
+  vertfetch_noclamp  8, v1.h[0]
+  vertfetch_noclamp  7, v0.h[7]
+  vertfetch_noclamp  6, v0.h[6]
+  vertfetch_noclamp  5, v0.h[5]
+  vertfetch_noclamp  4, v0.h[4]
+  vertfetch_noclamp  3, v0.h[3]
+  vertfetch_noclamp  2, v0.h[2]
+  vertfetch_noclamp  1, v0.h[1]
+  vertfetch_noclamp  0, v0.h[0]
+  \labelnc :
+
+  .purgem vertfetch_clamped
+  .purgem vertfetch_noclamp
+
+  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
             add         x15, x15, #16
             uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
             add         x19, x19, #16
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index 4ab1340..a6479cb 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -18,6 +18,8 @@
 #define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
 #define END(f) .fnend; .size f, .-f;
 
+#define ARCH_ARM_USE_BLUR_PRELOAD
+
 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
 .arm
 
@@ -40,6 +42,15 @@
 .endif
 .endm
 
+/* It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here.
+ */
+#if defined(ARCH_ARM_USE_BLUR_PRELOAD)
+#define VERTPLD(...) pld [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
 /* Fetch 16 columns of bytes (regardless of image format), convolve these
  * vertically, and leave them in the register file.  If working near the top or
  * bottom of an image then clamp the addressing while loading the data in.
@@ -76,7 +87,7 @@
             mls         r10, r2, r6, r1
 
             vmovl.u8    q14, d30
-            pld         [r1, #32]
+            VERTPLD(r1, #32)
             vmovl.u8    q15, d31
   .if \max_r < 16 // approximate
     ifcc    adr         \reg, 1f
@@ -97,45 +108,132 @@
      ifcc   .align 2
   2: ifcc   .word       1f-1b-8
 
-  .irp rowclamp, 1, 0
-    .set cc, \rowclamp
-    .align 4
-    .irp dreg, 6, 5, 4, 3, 2, 1, 0 ; .irp lane, 3, 2, 1, 0
-      .set i, \dreg * 4 + \lane
-      .if 0 < i && i <= \max_r
-        .if \rowclamp
-            vld1.8      {d20,d21}, [r10]
-            vld1.8      {d22,d23}, [r11]
-            cmp         r6, #i
-        .else
+  /* This version of the vertical fetch loop body is used away from the edges
+   * of the source image.  The pointers start at the top and bottom source rows
+   * and work their way towards the centre on each iteration.  This way the
+   * number of taps used can be controlled by jumping directly into the middle
+   * of the loop and running to completion.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_noclamp i, dreg
+    .if 0 < \i && \i <= \max_r
             vld1.8      {d20,d21}, [r10], r2
             vld1.8      {d22,d23}, [r11]
             sub         r11, r11, r2
-        .endif
             vswp        d21, d22
-            pld         [r10, #32]
+            VERTPLD(r10, #32)
             vaddl.u8    q10, d20, d21
-    ifcc    addhs       r10, r10, r2
             vaddl.u8    q11, d22, d23
-    ifcc    cmp         r7, #i
-            vmlal.u16   q12, d20, d\dreg[\lane]
-            pld         [r11, #32]
-            vmlal.u16   q13, d21, d\dreg[\lane]
-    ifcc    subhs       r11, r11, r2
-            vmlal.u16   q14, d22, d\dreg[\lane]
-    ifcc    nop
-            vmlal.u16   q15, d23, d\dreg[\lane]
-        .endif
-    .endr ; .endr
-    .if \rowclamp == 1
-        1: \labelc :
-            b           2f
-    .else
-        2: \labelnc :
+            vmlal.u16   q12, d20, \dreg
+            VERTPLD(r11, #32)
+            vmlal.u16   q13, d21, \dreg
+            vmlal.u16   q14, d22, \dreg
+            vmlal.u16   q15, d23, \dreg
     .endif
-  .endr
+  .endm
 
-            vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
+  /* This version of the vertical fetch loop body is used near the edges of the
+   * source image, where one or both of the accesses may start with a clamped
+   * value, and the row addresses only begin to change after some number of
+   * iterations before the end.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_clamped i, dreg
+    .if 0 < \i && \i <= \max_r
+            vld1.8      {d20,d21}, [r10]
+            vld1.8      {d22,d23}, [r11]
+            cmp         r6, #\i
+            vswp        d21, d22
+            VERTPLD(r10, #32)
+            vaddl.u8    q10, d20, d21
+            addhs       r10, r10, r2
+            vaddl.u8    q11, d22, d23
+            cmp         r7, #\i
+            vmlal.u16   q12, d20, \dreg
+            VERTPLD(r11, #32)
+            vmlal.u16   q13, d21, \dreg
+            subhs       r11, r11, r2
+            vmlal.u16   q14, d22, \dreg
+            nop
+            vmlal.u16   q15, d23, \dreg
+    .endif
+  .endm
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelc at the end of the block.
+   */
+  .align 4
+  vertfetch_clamped 27, d6[3]
+  vertfetch_clamped 26, d6[2]
+  vertfetch_clamped 25, d6[1]
+  vertfetch_clamped 24, d6[0]
+  vertfetch_clamped 23, d5[3]
+  vertfetch_clamped 22, d5[2]
+  vertfetch_clamped 21, d5[1]
+  vertfetch_clamped 20, d5[0]
+  vertfetch_clamped 19, d4[3]
+  vertfetch_clamped 18, d4[2]
+  vertfetch_clamped 17, d4[1]
+  vertfetch_clamped 16, d4[0]
+  vertfetch_clamped 15, d3[3]
+  vertfetch_clamped 14, d3[2]
+  vertfetch_clamped 13, d3[1]
+  vertfetch_clamped 12, d3[0]
+  vertfetch_clamped 11, d2[3]
+  vertfetch_clamped 10, d2[2]
+  vertfetch_clamped  9, d2[1]
+  vertfetch_clamped  8, d2[0]
+  vertfetch_clamped  7, d1[3]
+  vertfetch_clamped  6, d1[2]
+  vertfetch_clamped  5, d1[1]
+  vertfetch_clamped  4, d1[0]
+  vertfetch_clamped  3, d0[3]
+  vertfetch_clamped  2, d0[2]
+  vertfetch_clamped  1, d0[1]
+  vertfetch_clamped  0, d0[0]
+  1:
+  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelnc at the end of the block.
+   */
+  .align 4
+  vertfetch_noclamp 27, d6[3]
+  vertfetch_noclamp 26, d6[2]
+  vertfetch_noclamp 25, d6[1]
+  vertfetch_noclamp 24, d6[0]
+  vertfetch_noclamp 23, d5[3]
+  vertfetch_noclamp 22, d5[2]
+  vertfetch_noclamp 21, d5[1]
+  vertfetch_noclamp 20, d5[0]
+  vertfetch_noclamp 19, d4[3]
+  vertfetch_noclamp 18, d4[2]
+  vertfetch_noclamp 17, d4[1]
+  vertfetch_noclamp 16, d4[0]
+  vertfetch_noclamp 15, d3[3]
+  vertfetch_noclamp 14, d3[2]
+  vertfetch_noclamp 13, d3[1]
+  vertfetch_noclamp 12, d3[0]
+  vertfetch_noclamp 11, d2[3]
+  vertfetch_noclamp 10, d2[2]
+  vertfetch_noclamp  9, d2[1]
+  vertfetch_noclamp  8, d2[0]
+  vertfetch_noclamp  7, d1[3]
+  vertfetch_noclamp  6, d1[2]
+  vertfetch_noclamp  5, d1[1]
+  vertfetch_noclamp  4, d1[0]
+  vertfetch_noclamp  3, d0[3]
+  vertfetch_noclamp  2, d0[2]
+  vertfetch_noclamp  1, d0[1]
+  vertfetch_noclamp  0, d0[0]
+  \labelnc :
+
+  .purgem vertfetch_clamped
+  .purgem vertfetch_noclamp
+
+  2:        vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
             vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
             vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
             vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index f28c946..718d611 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -433,13 +433,40 @@
 //////////////////////////////////////////////////////////////////////////////
 // ForEach routines
 //////////////////////////////////////////////////////////////////////////////
+void rsForEachInternal(int slot,
+                       rs_script_call *call,
+                       int hasOutput,
+                       int numIn,
+                       ...) {
+    Context *rsc = RsdCpuReference::getTlsContext();
+    Script *s = const_cast<Script*>(RsdCpuReference::getTlsScript());
+    if (numIn > 100) {
+        ALOGE("rsForEachInternal: too many inputs to a kernel.");
+        return;
+    }
+    Allocation* inputs[100];
+    Allocation* out = nullptr;
+    va_list argp;
+    va_start(argp, numIn);
+    for (int i = 0; i < numIn; i++) {
+        ::rs_allocation alloc = va_arg(argp, ::rs_allocation);
+        inputs[i] = (Allocation*)alloc.p;
+    }
+    if (hasOutput) {
+        ::rs_allocation outAlloc = va_arg(argp, ::rs_allocation);
+        out = (Allocation*)outAlloc.p;
+    }
+    va_end(argp);
+    rsrForEach(rsc, s, slot, numIn, numIn > 0 ? inputs : nullptr, out, nullptr, 0, (RsScriptCall*)call);
+}
+
 void __attribute__((overloadable)) rsForEach(::rs_script script,
                                              ::rs_allocation in,
                                              ::rs_allocation out,
                                              const void *usr,
                                              const rs_script_call *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, (Allocation *)in.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p,
                (Allocation *)out.p, usr, 0, (RsScriptCall *)call);
 }
 
@@ -448,7 +475,7 @@
                                              ::rs_allocation out,
                                              const void *usr) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, 0, nullptr);
 }
 
@@ -456,7 +483,7 @@
                                              ::rs_allocation in,
                                              ::rs_allocation out) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                nullptr, 0, nullptr);
 }
 
@@ -468,7 +495,7 @@
                                              const void *usr,
                                              uint32_t usrLen) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, usrLen, nullptr);
 }
 
@@ -479,7 +506,7 @@
                                              uint32_t usrLen,
                                              const rs_script_call *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script *)script.p, (Allocation *)in.p, (Allocation *)out.p,
+    rsrForEach(rsc, (Script *)script.p, 0, 1, (Allocation **)&in.p, (Allocation *)out.p,
                usr, usrLen, (RsScriptCall *)call);
 }
 #endif
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index 3294aed..ecd661e 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -85,6 +85,7 @@
         unitTests.add(new UT_foreach(this, mRes, mCtx));
         unitTests.add(new UT_foreach_bounds(this, mRes, mCtx));
         unitTests.add(new UT_noroot(this, mRes, mCtx));
+        unitTests.add(new UT_single_source_script(this, mRes, mCtx));
         unitTests.add(new UT_script_group2_pointwise(this, mRes, mCtx));
         unitTests.add(new UT_script_group2_gatherscatter(this, mRes, mCtx));
         unitTests.add(new UT_script_group2_nochain(this, mRes, mCtx));
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_single_source_script.java b/java/tests/RsTest/src/com/android/rs/test/UT_single_source_script.java
new file mode 100644
index 0000000..5765d4e
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_single_source_script.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+
+public class UT_single_source_script extends UnitTest {
+    private Resources mRes;
+    private Allocation testAllocation1, testAllocation2;
+
+    protected UT_single_source_script(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "SingleSourceScript", ctx);
+        mRes = res;
+    }
+
+    private void initializeGlobals(RenderScript RS, ScriptC_single_source_script s) {
+        Type.Builder i32TypeBuilder = new Type.Builder(RS, Element.I32(RS));
+        int X = 1024;
+        int Y = 768;
+        s.set_dimX(X);
+        s.set_dimY(Y);
+        i32TypeBuilder.setX(X).setY(Y);
+        testAllocation1 = Allocation.createTyped(RS, i32TypeBuilder.create());
+        testAllocation2 = Allocation.createTyped(RS, i32TypeBuilder.create());
+    }
+
+    public void run() {
+        RenderScript pRS = RenderScript.create(mCtx);
+        ScriptC_single_source_script s = new ScriptC_single_source_script(pRS);
+        pRS.setMessageHandler(mRsMessage);
+        initializeGlobals(pRS, s);
+
+        s.invoke_entrypoint(testAllocation1, testAllocation2);
+
+        pRS.finish();
+        waitForMessage();
+        pRS.destroy();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs b/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs
new file mode 100644
index 0000000..e34dd5b
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/single_source_script.rs
@@ -0,0 +1,70 @@
+#include "shared.rsh"
+
+int dimX;
+int dimY;
+
+int __attribute__((kernel)) foo(int a) {
+    return a * 2;
+}
+
+int __attribute__((kernel)) goo(int a, int b) {
+    return a + b;
+}
+
+static void validate(rs_allocation out) {
+    bool failed = false;
+
+    int i, j;
+
+    for (j = 0; j < dimY; j++) {
+        for (i = 0; i < dimX; i++) {
+            const int actual = rsGetElementAt_int(out, i, j);
+            int expected = (i + j * dimX) * 4;
+            if (j < dimY / 2) {
+                expected *= 2;
+            }
+            expected += (i + j * dimX);
+            if (actual != expected) {
+                failed = true;
+                rsDebug("row     ", j);
+                rsDebug("column  ", i);
+                rsDebug("expects ", expected);
+                rsDebug("got     ", actual);
+            }
+        }
+    }
+
+    if (failed) {
+        rsDebug("FAILED", 0);
+    } else {
+        rsDebug("PASSED", 0);
+    }
+
+    if (failed) {
+        rsSendToClientBlocking(RS_MSG_TEST_FAILED);
+    } else {
+        rsSendToClientBlocking(RS_MSG_TEST_PASSED);
+    }
+}
+
+void entrypoint(rs_allocation in, rs_allocation out) {
+    int i, j;
+    for (i = 0; i < dimX; i++) {
+        for (j = 0; j < dimY; j++) {
+            rsSetElementAt_int(in, j * dimX + i, i, j);
+        }
+    }
+
+    rsForEach(foo, in, out);
+    rsForEach(foo, out, out);
+    rs_script_call_t opts = {0};
+    opts.xStart = 0;
+    opts.xEnd = dimX;
+    opts.yStart = 0;
+    opts.yEnd = dimY / 2;
+    rsForEachWithOptions(foo, &opts, out, out);
+
+    rsForEach(goo, in, out, out);
+
+    validate(out);
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/test_root.rs b/java/tests/RsTest/src/com/android/rs/test/test_root.rs
deleted file mode 100644
index 6dc83ba..0000000
--- a/java/tests/RsTest/src/com/android/rs/test/test_root.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-// Fountain test script
-#pragma version(1)
-
-#pragma rs java_package_name(com.android.rs.test)
-
-#pragma stateFragment(parent)
-
-#include "rs_graphics.rsh"
-
-
-typedef struct TestResult {
-    rs_allocation name;
-    bool pass;
-    float score;
-} TestResult_t;
-TestResult_t *results;
-
-int root() {
-
-    return 0;
-}
-
-
diff --git a/rsRuntime.h b/rsRuntime.h
index 5a05883..9bc05b3 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -155,7 +155,9 @@
 
 
 void rsrForEach(Context *, Script *target,
-                Allocation *in,
+                uint32_t slot,
+                uint32_t numInputs,
+                Allocation **in,
                 Allocation *out,
                 const void *usr,
                 uint32_t usrBytes,
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index c404bde..a411e34 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -236,20 +236,12 @@
 
 void rsrForEach(Context *rsc,
                 Script *target,
-                Allocation *in, Allocation *out,
+                uint32_t slot,
+                uint32_t numInputs,
+                Allocation **in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
                 const RsScriptCall *call) {
-
-    if (in == nullptr) {
-        target->runForEach(rsc, /* root slot */ 0, nullptr, 0, out, usr,
-                           usrBytes, call);
-
-    } else {
-        const Allocation *ins[1] = {in};
-        target->runForEach(rsc, /* root slot */ 0, ins,
-                           sizeof(ins) / sizeof(RsAllocation), out, usr,
-                           usrBytes, call);
-    }
+    target->runForEach(rsc, slot, (const Allocation**)in, numInputs, out, usr, usrBytes, call);
 }
 
 void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
diff --git a/scriptc/rs_for_each.rsh b/scriptc/rs_for_each.rsh
index 9771d09..6a42b41 100644
--- a/scriptc/rs_for_each.rsh
+++ b/scriptc/rs_for_each.rsh
@@ -91,6 +91,16 @@
 } rs_script_call_t;
 
 /*
+ * rs_kernel: Handle to a kernel function
+ *
+ *  An opaque type for a function that is defined with the kernel attribute.  A value
+ *  of this type can be used in a rsForEach call to launch a kernel.
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
+typedef void* rs_kernel;
+#endif
+
+/*
  * rsForEach: Invoke the root kernel of a script
  *
  * Invoke the kernel named "root" of the specified script.  Like other kernels, this root()
@@ -119,6 +129,8 @@
  *   usrData: User defined data to pass to the script.  May be NULL.
  *   sc: Extra control information used to select a sub-region of the allocation to be processed or suggest a walking strategy.  May be NULL.
  *   usrDataLen: Size of the userData structure.  This will be used to perform a shallow copy of the data if necessary.
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   ...: Input and output allocations
  */
 #if !defined(RS_VERSION) || (RS_VERSION <= 13)
 extern void __attribute__((overloadable))
@@ -143,11 +155,31 @@
               size_t usrDataLen);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 14))
+#if (defined(RS_VERSION) && (RS_VERSION >= 14) && (RS_VERSION <= 23))
 extern void __attribute__((overloadable))
     rsForEach(rs_script script, rs_allocation input, rs_allocation output);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
+extern void
+    rsForEach(rs_kernel kernel,  ...);
+#endif
+
+/*
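+ * rsForEachWithOptions: Launch a kernel with options
+ *
+ *  Launches the specified kernel like rsForEach, with launch options (see rs_script_call_t) that can select a sub-region of the allocations to be processed.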
+ *
+ * Parameters:
+ *   kernel: Function designator to a function that is defined with the kernel attribute.
+ *   options: Launch options
+ *   ...: Input and output allocations
+ */
+#if (defined(RS_VERSION) && (RS_VERSION >= 4294967295) && (defined(RS_DECLARE_EXPIRED_APIS) || RS_VERSION <= 4294967295))
+extern void
+    rsForEachWithOptions(rs_kernel kernel, rs_script_call_t* options,  ...);
+#endif
+
 /*
  * rsGetArray0: Index in the Array0 dimension for the specified kernel context
  *