test_conformance/conversions/test_conversions.cpp - platform/external/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include "harness/compat.h"
 #include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 #include "harness/testHarness.h"
 #include "harness/kernelHelpers.h"
 #include "harness/parseParameters.h"
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif

 #if defined(__linux__)
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <linux/sysctl.h>
 #endif
 #if defined(__linux__)
 #include <sys/param.h>
 #include <libgen.h>
 #endif

 #include "mingw_compat.h"
 #if defined(__MINGW32__)
 #include <sys/param.h>
 #endif

 #include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
 #if !defined(_WIN32)
 #include <libgen.h>
 #include <sys/mman.h>
 #endif
 #include <time.h>

 #include <algorithm>

 #include "Sleep.h"
 #include "basic_test_conversions.h"

 #if (defined(_WIN32) && defined(_MSC_VER))
 // need for _controlfp_s and rouinding modes in RoundingMode
 #include "harness/testHarness.h"
 #endif

 #pragma mark -
 #pragma mark globals

 #define BUFFER_SIZE (1024 * 1024)
 #define kPageSize 4096
 #define EMBEDDED_REDUCTION_FACTOR 16
 #define PERF_LOOP_COUNT 100

 #define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)

 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
 #include "fplib.h"
 extern bool qcom_sat;
 extern roundingMode qcom_rm;
 #endif

 const char **argList = NULL;
 int argCount = 0;
 cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
 char appName[64] = "ctest";
 int gStartTestNumber = -1;
 int gEndTestNumber = 0;
 #if defined(__APPLE__)
 int gTimeResults = 1;
 #else
 int gTimeResults = 0;
 #endif
 int gReportAverageTimes = 0;
 void *gIn = NULL;
 void *gRef = NULL;
 void *gAllowZ = NULL;
 void *gOut[kCallStyleCount] = { NULL };
 cl_mem gInBuffer;
 cl_mem gOutBuffers[kCallStyleCount];
 size_t gComputeDevices = 0;
 uint32_t gDeviceFrequency = 0;
 int gWimpyMode = 0;
 int gWimpyReductionFactor = 128;
 int gSkipTesting = 0;
 int gForceFTZ = 0;
 int gMultithread = 1;
 int gIsRTZ = 0;
 uint32_t gSimdSize = 1;
 int gHasDouble = 0;
 int gTestDouble = 1;
 const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
 const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
 int gMinVectorSize = 0;
 int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
 static MTdata gMTdata;

 #pragma mark -
 #pragma mark Declarations

 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 test_status InitCL(cl_device_id device);
 static int GetTestCase(const char *name, Type *outType, Type *inType,
                        SaturationMode *sat, RoundingMode *round);
 static int DoTest(cl_device_id device, Type outType, Type inType,
                   SaturationMode sat, RoundingMode round, MTdata d);
 static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                               RoundingMode round, int vectorSize,
                               cl_kernel *outKernel);
 static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
                      size_t blockCount);

 void *FlushToZero(void);
 void UnFlushToZero(void *);

 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
 // convert long and ulong to float and double or otherwise deal with values
 // that need more precision than 53-bit. So, set the x87 to 64-bit precision.
 static inline void Force64BitFPUPrecision(void)
 {
 #if __MINGW32__
     // The usual method is to use _controlfp as follows:
     //     #include <float.h>
     //     _controlfp(_PC_64, _MCW_PC);
     //
     // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
     // divergent code just use inline assembly which works for both.
     unsigned short int orig_cw = 0;
     unsigned short int new_cw = 0;
     __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
     new_cw = orig_cw | 0x0300; // set precision to 64-bit
     __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
 #else
     /* Implement for other platforms if needed */
 #endif
 }

 int test_conversions(cl_device_id device, cl_context context,
                      cl_command_queue queue, int num_elements)
 {
     int error, i, testNumber = -1;
     int startMinVectorSize = gMinVectorSize;
     Type inType, outType;
     RoundingMode round;
     SaturationMode sat;

     if (argCount)
     {
         for (i = 0; i < argCount; i++)
         {
             if (GetTestCase(argList[i], &outType, &inType, &sat, &round))
             {
                 vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
                            "%s.  Skipping....  *****\n\n",
                            argList[i]);
                 continue;
             }

             // skip double if we don't have it
             if (!gTestDouble && (inType == kdouble || outType == kdouble))
             {
                 if (gHasDouble)
                 {
                     vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                                gTypeNames[outType], gSaturationNames[sat],
                                gRoundingModeNames[round], gTypeNames[inType]);
                     vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
                          "off.\n");
                 }

                 continue;
             }

             // skip longs on embedded
             if (!gHasLong
                 && (inType == klong || outType == klong || inType == kulong
                     || outType == kulong))
             {
                 continue;
             }

             // Skip the implicit converts if the rounding mode is not default or
             // test is saturated
             if (0 == startMinVectorSize)
             {
                 if (sat || round != kDefaultRoundingMode)
                     gMinVectorSize = 1;
                 else
                     gMinVectorSize = 0;
             }

             if ((error = DoTest(device, outType, inType, sat, round, gMTdata)))
             {
                 vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                            gTypeNames[outType], gSaturationNames[sat],
                            gRoundingModeNames[round], gTypeNames[inType]);
             }
         }
     }
     else
     {
         for (outType = (Type)0; outType < kTypeCount;
              outType = (Type)(outType + 1))
         {
             for (inType = (Type)0; inType < kTypeCount;
                  inType = (Type)(inType + 1))
             {
                 // skip longs on embedded
                 if (!gHasLong
                     && (inType == klong || outType == klong || inType == kulong
                         || outType == kulong))
                 {
                     continue;
                 }

                 for (sat = (SaturationMode)0; sat < kSaturationModeCount;
                      sat = (SaturationMode)(sat + 1))
                 {
                     // skip illegal saturated conversions to float type
                     if (kSaturated == sat
                         && (outType == kfloat || outType == kdouble))
                     {
                         continue;
                     }

                     for (round = (RoundingMode)0; round < kRoundingModeCount;
                          round = (RoundingMode)(round + 1))
                     {
                         if (++testNumber < gStartTestNumber)
                         {
                             //     vlog( "%d) skipping convert_%sn%s%s( %sn
                             //     )\n", testNumber, gTypeNames[ outType ],
                             //     gSaturationNames[ sat ],
                             //     gRoundingModeNames[round], gTypeNames[inType]
                             //     );
                             continue;
                         }
                         else
                         {
                             if (gEndTestNumber > 0
                                 && testNumber >= gEndTestNumber)
                             {
                                 goto exit;
                             }
                         }

                         vlog("%d) Testing convert_%sn%s%s( %sn ):\n",
                              testNumber, gTypeNames[outType],
                              gSaturationNames[sat], gRoundingModeNames[round],
                              gTypeNames[inType]);

                         // skip double if we don't have it
                         if (!gTestDouble
                             && (inType == kdouble || outType == kdouble))
                         {
                             if (gHasDouble)
                             {
                                 vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
                                            "FAILED ** \n",
                                            testNumber, gTypeNames[outType],
                                            gSaturationNames[sat],
                                            gRoundingModeNames[round],
                                            gTypeNames[inType]);
                                 vlog("\t\tcl_khr_fp64 enabled, but double "
                                      "testing turned off.\n");
                             }
                             continue;
                         }

                         // Skip the implicit converts if the rounding mode is
                         // not default or test is saturated
                         if (0 == startMinVectorSize)
                         {
                             if (sat || round != kDefaultRoundingMode)
                                 gMinVectorSize = 1;
                             else
                                 gMinVectorSize = 0;
                         }

                         if ((error = DoTest(device, outType, inType, sat, round,
                                             gMTdata)))
                         {
                             vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
                                        "FAILED ** \n",
                                        testNumber, gTypeNames[outType],
                                        gSaturationNames[sat],
                                        gRoundingModeNames[round],
                                        gTypeNames[inType]);
                         }
                     }
                 }
             }
         }
     }

 exit:
     return gFailCount;
 }

 test_definition test_list[] = {
     ADD_TEST(conversions),
 };

 const int test_num = ARRAY_SIZE(test_list);

 #pragma mark -

 int main(int argc, const char **argv)
 {
     int error;
     cl_uint seed = (cl_uint)time(NULL);

     argc = parseCustomParam(argc, argv);
     if (argc == -1)
     {
         return 1;
     }

     if ((error = ParseArgs(argc, argv))) return error;

     // Turn off sleep so our tests run to completion
     PreventSleep();
     atexit(ResumeSleep);

     if (!gMultithread) SetThreadCount(1);

 #if defined(_MSC_VER) && defined(_M_IX86)
     // VS2005 (and probably others, since long double got deprecated) sets
     // the x87 to 53-bit precision. This causes problems with the tests
     // that convert long and ulong to float and double, since they deal
     // with values that need more precision than that. So, set the x87
     // to 64-bit precision.
     unsigned int ignored;
     _controlfp_s(&ignored, _PC_64, _MCW_PC);
 #endif

     vlog("===========================================================\n");
     vlog("Random seed: %u\n", seed);
     gMTdata = init_genrand(seed);

     const char *arg[] = { argv[0] };
     int ret =
         runTestHarnessWithCheck(1, arg, test_num, test_list, true, 0, InitCL);

     free_mtdata(gMTdata);
     if (gQueue)
     {
         error = clFinish(gQueue);
         if (error) vlog_error("clFinish failed: %d\n", error);
     }

     clReleaseMemObject(gInBuffer);

     for (int i = 0; i < kCallStyleCount; i++)
     {
         clReleaseMemObject(gOutBuffers[i]);
     }
     clReleaseCommandQueue(gQueue);
     clReleaseContext(gContext);

     return ret;
 }

 #pragma mark -
 #pragma mark setup

 static int ParseArgs(int argc, const char **argv)
 {
     int i;
     argList = (const char **)calloc(argc, sizeof(char *));
     argCount = 0;

     if (NULL == argList && argc > 1) return -1;

 #if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__))
     { // Extract the app name
         char baseName[MAXPATHLEN];
         strncpy(baseName, argv[0], MAXPATHLEN);
         char *base = basename(baseName);
         if (NULL != base)
         {
             strncpy(appName, base, sizeof(appName));
             appName[sizeof(appName) - 1] = '\0';
         }
     }
 #elif defined(_WIN32)
     {
         char fname[_MAX_FNAME + _MAX_EXT + 1];
         char ext[_MAX_EXT];

         errno_t err = _splitpath_s(argv[0], NULL, 0, NULL, 0, fname, _MAX_FNAME,
                                    ext, _MAX_EXT);
         if (err == 0)
         { // no error
             strcat(fname, ext); // just cat them, size of frame can keep both
             strncpy(appName, fname, sizeof(appName));
             appName[sizeof(appName) - 1] = '\0';
         }
     }
 #endif

     vlog("\n%s", appName);
     for (i = 1; i < argc; i++)
     {
         const char *arg = argv[i];
         if (NULL == arg) break;

         vlog("\t%s", arg);
         if (arg[0] == '-')
         {
             arg++;
             while (*arg != '\0')
             {
                 switch (*arg)
                 {
                     case 'd': gTestDouble ^= 1; break;
                     case 'l': gSkipTesting ^= 1; break;
                     case 'm': gMultithread ^= 1; break;
                     case 'w': gWimpyMode ^= 1; break;
                     case '[':
                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                         break;
                     case 'z': gForceFTZ ^= 1; break;
                     case 't': gTimeResults ^= 1; break;
                     case 'a': gReportAverageTimes ^= 1; break;
                     case '1':
                         if (arg[1] == '6')
                         {
                             gMinVectorSize = 6;
                             gMaxVectorSize = 7;
                             arg++;
                         }
                         else
                         {
                             gMinVectorSize = 0;
                             gMaxVectorSize = 2;
                         }
                         break;

                     case '2':
                         gMinVectorSize = 2;
                         gMaxVectorSize = 3;
                         break;

                     case '3':
                         gMinVectorSize = 3;
                         gMaxVectorSize = 4;
                         break;

                     case '4':
                         gMinVectorSize = 4;
                         gMaxVectorSize = 5;
                         break;

                     case '8':
                         gMinVectorSize = 5;
                         gMaxVectorSize = 6;
                         break;

                     default:
                         vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
                         PrintUsage();
                         return -1;
                 }
                 arg++;
             }
         }
         else
         {
             char *t = NULL;
             long number = strtol(arg, &t, 0);
             if (t != arg)
             {
                 if (gStartTestNumber != -1)
                     gEndTestNumber = gStartTestNumber + (int)number;
                 else
                     gStartTestNumber = (int)number;
             }
             else
             {
                 argList[argCount] = arg;
                 argCount++;
             }
         }
     }

     // Check for the wimpy mode environment variable
     if (getenv("CL_WIMPY_MODE"))
     {
         vlog("\n");
         vlog("*** Detected CL_WIMPY_MODE env                          ***\n");
         gWimpyMode = 1;
     }

     vlog( "\n" );

     PrintArch();

     if (gWimpyMode)
     {
         vlog("\n");
         vlog("*** WARNING: Testing in Wimpy mode!                     ***\n");
         vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
         vlog("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n");
         vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
              gWimpyReductionFactor);
     }

     return 0;
 }

 static void PrintUsage(void)
 {
     int i;
     vlog("%s [-wz#]: <optional: test names>\n", appName);
     vlog("\ttest names:\n");
     vlog("\t\tdestFormat<_sat><_round>_sourceFormat\n");
     vlog("\t\t\tPossible format types are:\n\t\t\t\t");
     for (i = 0; i < kTypeCount; i++) vlog("%s, ", gTypeNames[i]);
     vlog("\n\n\t\t\tPossible saturation values are: (empty) and _sat\n");
     vlog("\t\t\tPossible rounding values are:\n\t\t\t\t(empty), ");
     for (i = 1; i < kRoundingModeCount; i++)
         vlog("%s, ", gRoundingModeNames[i]);
     vlog("\n\t\t\tExamples:\n");
     vlog("\t\t\t\tulong_short   converts short to ulong\n");
     vlog("\t\t\t\tchar_sat_rte_float   converts float to char with saturated "
          "clipping in round to nearest rounding mode\n\n");
     vlog("\toptions:\n");
     vlog("\t\t-d\tToggle testing of double precision.  On by default if "
          "cl_khr_fp64 is enabled, ignored otherwise.\n");
     vlog("\t\t-l\tToggle link check mode. When on, testing is skipped, and we "
          "just check to see that the kernels build. (Off by default.)\n");
     vlog("\t\t-m\tToggle Multithreading. (On by default.)\n");
     vlog("\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very "
          "small subset of the tests for each fn. NOT A VALID TEST! (Off by "
          "default.)\n");
     vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
          "1-12, default factor(%u)\n",
          gWimpyReductionFactor);
     vlog("\t\t-z\tToggle flush to zero mode  (Default: per device)\n");
     vlog("\t\t-#\tTest just vector size given by #, where # is an element of "
          "the set {1,2,3,4,8,16}\n");
     vlog("\n");
     vlog(
         "You may also pass the number of the test on which to start.\nA second "
         "number can be then passed to indicate how many tests to run\n\n");
 }


 static int GetTestCase(const char *name, Type *outType, Type *inType,
                        SaturationMode *sat, RoundingMode *round)
 {
     int i;

     // Find the return type
     for (i = 0; i < kTypeCount; i++)
         if (name == strstr(name, gTypeNames[i]))
         {
             *outType = (Type)i;
             name += strlen(gTypeNames[i]);

             break;
         }

     if (i == kTypeCount) return -1;

     // Check to see if _sat appears next
     *sat = (SaturationMode)0;
     for (i = 1; i < kSaturationModeCount; i++)
         if (name == strstr(name, gSaturationNames[i]))
         {
             *sat = (SaturationMode)i;
             name += strlen(gSaturationNames[i]);
             break;
         }

     *round = (RoundingMode)0;
     for (i = 1; i < kRoundingModeCount; i++)
         if (name == strstr(name, gRoundingModeNames[i]))
         {
             *round = (RoundingMode)i;
             name += strlen(gRoundingModeNames[i]);
             break;
         }

     if (*name != '_') return -2;
     name++;

     for (i = 0; i < kTypeCount; i++)
         if (name == strstr(name, gTypeNames[i]))
         {
             *inType = (Type)i;
             name += strlen(gTypeNames[i]);

             break;
         }

     if (i == kTypeCount) return -3;

     if (*name != '\0') return -4;

     return 0;
 }

 #pragma mark -
 #pragma mark OpenCL

 test_status InitCL(cl_device_id device)
 {
     int error, i;
     size_t configSize = sizeof(gComputeDevices);

     if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                                  configSize, &gComputeDevices, NULL)))
         gComputeDevices = 1;

     configSize = sizeof(gDeviceFrequency);
     if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
                                  configSize, &gDeviceFrequency, NULL)))
         gDeviceFrequency = 0;

     cl_device_fp_config floatCapabilities = 0;
     if ((error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG,
                                  sizeof(floatCapabilities), &floatCapabilities,
                                  NULL)))
         floatCapabilities = 0;
     if (0 == (CL_FP_DENORM & floatCapabilities)) gForceFTZ ^= 1;

     if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST))
     {
         char profileStr[128] = "";
         // Verify that we are an embedded profile device
         if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE,
                                      sizeof(profileStr), profileStr, NULL)))
         {
             vlog_error("FAILURE: Could not get device profile: error %d\n",
                        error);
             return TEST_FAIL;
         }

         if (strcmp(profileStr, "EMBEDDED_PROFILE"))
         {
             vlog_error("FAILURE: non-embedded profile device does not support "
                        "CL_FP_ROUND_TO_NEAREST\n");
             return TEST_FAIL;
         }

         if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO))
         {
             vlog_error("FAILURE: embedded profile device supports neither "
                        "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
             return TEST_FAIL;
         }

         gIsRTZ = 1;
     }

     else if (is_extension_available(device, "cl_khr_fp64"))
     {
         gHasDouble = 1;
     }
     gTestDouble &= gHasDouble;

     gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
     if (NULL == gContext || error)
     {
         vlog_error("clCreateContext failed. (%d)\n", error);
         return TEST_FAIL;
     }

     gQueue = clCreateCommandQueue(gContext, device, 0, &error);
     if (NULL == gQueue || error)
     {
         vlog_error("clCreateCommandQueue failed. (%d)\n", error);
         return TEST_FAIL;
     }

     // Allocate buffers
     // FIXME: use clProtectedArray for guarded allocations?
     gIn = malloc(BUFFER_SIZE + 2 * kPageSize);
     gAllowZ = malloc(BUFFER_SIZE + 2 * kPageSize);
     gRef = malloc(BUFFER_SIZE + 2 * kPageSize);
     for (i = 0; i < kCallStyleCount; i++)
     {
         gOut[i] = malloc(BUFFER_SIZE + 2 * kPageSize);
         if (NULL == gOut[i]) return TEST_FAIL;
     }

     // setup input buffers
     gInBuffer =
         clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                        BUFFER_SIZE, NULL, &error);
     if (gInBuffer == NULL || error)
     {
         vlog_error("clCreateBuffer failed for input (%d)\n", error);
         return TEST_FAIL;
     }

     // setup output buffers
     for (i = 0; i < kCallStyleCount; i++)
     {
         gOutBuffers[i] =
             clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                            BUFFER_SIZE, NULL, &error);
         if (gOutBuffers[i] == NULL || error)
         {
             vlog_error("clCreateArray failed for output (%d)\n", error);
             return TEST_FAIL;
         }
     }


     gMTdata = init_genrand(gRandomSeed);


     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
     vlog("\nCompute Device info:\n");
     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
     vlog("\tDevice Name: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
     vlog("\tVendor: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
     vlog("\tDevice Version: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
     vlog("\tCL C Version: %s\n", c);
     clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
     vlog("\tDriver Version: %s\n", c);
     vlog("\tProcessing with %ld devices\n", gComputeDevices);
     vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
     vlog("\tSubnormal values supported for floats? %s\n",
          no_yes[0 != (CL_FP_DENORM & floatCapabilities)]);
     vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]);
     vlog("\tTesting with default RTZ mode for floats? %s\n",
          no_yes[0 != gIsRTZ]);
     vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]);
     if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]);
     vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]);
     vlog("\tTesting vector sizes: ");
     for (i = gMinVectorSize; i < gMaxVectorSize; i++)
         vlog("\t%d", vectorSizes[i]);
     vlog("\n");
     return TEST_PASS;
 }

 static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
                      size_t blockCount)
 {
     // The global dimensions are just the blockCount to execute since we haven't
     // set up multiple queues for multiple devices.
     int error;

     error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
     error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);

     if (error)
     {
         vlog_error("FAILED -- could not set kernel args (%d)\n", error);
         return error;
     }

     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
                                         NULL, 0, NULL, NULL)))
     {
         vlog_error("FAILED -- could not execute kernel (%d)\n", error);
         return error;
     }

     return 0;
 }

 #if defined(__APPLE__)
 #include <mach/mach_time.h>
 #endif

 uint64_t GetTime(void);
 uint64_t GetTime(void)
 {
 #if defined(__APPLE__)
     return mach_absolute_time();
 #elif defined(_MSC_VER)
     return ReadTime();
 #else
     // mach_absolute_time is a high precision timer with precision < 1
     // microsecond.
 #warning need accurate clock here.  Times are invalid.
     return 0;
 #endif
 }


 #if defined(_MSC_VER)
 /* function is defined in "compat.h" */
 #else
 double SubtractTime(uint64_t endTime, uint64_t startTime);
 double SubtractTime(uint64_t endTime, uint64_t startTime)
 {
     uint64_t diff = endTime - startTime;
     static double conversion = 0.0;

     if (0.0 == conversion)
     {
 #if defined(__APPLE__)
         mach_timebase_info_data_t info = { 0, 0 };
         kern_return_t err = mach_timebase_info(&info);
         if (0 == err)
             conversion = 1e-9 * (double)info.numer / (double)info.denom;
 #else
         // This function consumes output from GetTime() above, and converts the
         // time to secionds.
 #warning need accurate ticks to seconds conversion factor here. Times are invalid.
 #endif
     }

     // strictly speaking we should also be subtracting out timer latency here
     return conversion * (double)diff;
 }
 #endif

 typedef struct CalcReferenceValuesInfo
 {
     struct WriteInputBufferInfo
         *parent; // pointer back to the parent WriteInputBufferInfo struct
     cl_kernel kernel; // the kernel for this vector size
     cl_program program; // the program for this vector size
     cl_uint vectorSize; // the vector size for this callback chain
     void *p; // the pointer to mapped result data for this vector size
     cl_int result;
 } CalcReferenceValuesInfo;

 typedef struct WriteInputBufferInfo
 {
     volatile cl_event
         calcReferenceValues; // user event which signals when main thread is
                              // done calculating reference values
     volatile cl_event
         doneBarrier; // user event which signals when worker threads are done
     cl_uint count; // the number of elements in the array
     Type outType; // the data type of the conversion result
     Type inType; // the data type of the conversion input
     volatile int barrierCount;
     CalcReferenceValuesInfo calcInfo[kCallStyleCount];
 } WriteInputBufferInfo;

 cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
     if (0 == (x & (x - 1))) return x;

     while (x & (x - 1)) x &= x - 1;

     return x + x;
 }

 void WriteInputBufferComplete(void *);

 typedef struct DataInitInfo
 {
     cl_ulong start;
     cl_uint size;
     Type outType;
     Type inType;
     SaturationMode sat;
     RoundingMode round;
     MTdata *d;
 } DataInitInfo;

 cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
 cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
 {
     DataInitInfo *info = (DataInitInfo *)p;

     gInitFunctions[info->inType](
         (char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat,
         info->round, info->outType, info->start + job_id * info->size,
         info->size, info->d[thread_id]);
     return CL_SUCCESS;
 }

 static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
 {
     cl_uint i;
     for (i = 0; i < count; ++i)
         allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
 }

 cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
 cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 {
     DataInitInfo *info = (DataInitInfo *)p;
     cl_uint count = info->size;
     Type inType = info->inType;
     Type outType = info->outType;
     RoundingMode round = info->round;
     size_t j;

     Force64BitFPUPrecision();

     void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
     void *a = (cl_uchar *)gAllowZ + job_id * count;
     void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];

     if (outType != inType)
     {
         // create the reference while we wait
         Convert f = gConversions[outType][inType];
         if (info->sat) f = gSaturatedConversions[outType][inType];

 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
         /* ARM VFP doesn't have hardware instruction for converting from 64-bit
          * integer to float types, hence GCC ARM uses the floating-point
          * emulation code despite which -mfloat-abi setting it is. But the
          * emulation code in libgcc.a has only one rounding mode (round to
          * nearest even in this case) and ignores the user rounding mode setting
          * in hardware. As a result setting rounding modes in hardware won't
          * give correct rounding results for type covert from 64-bit integer to
          * float using GCC for ARM compiler so for testing different rounding
          * modes, we need to use alternative reference function. ARM64 does have
          * an instruction, however we cannot guarantee the compiler will use it.
          * On all ARM architechures use emulation to calculate reference.*/
         switch (round)
         {
             /* conversions to floating-point type use the current rounding mode.
              * The only default floating-point rounding mode supported is round
              * to nearest even i.e the current rounding mode will be _rte for
              * floating-point types. */
             case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
             case kRoundToNearestEven: qcom_rm = qcomRTE; break;
             case kRoundUp: qcom_rm = qcomRTP; break;
             case kRoundDown: qcom_rm = qcomRTN; break;
             case kRoundTowardZero: qcom_rm = qcomRTZ; break;
             default:
                 vlog_error("ERROR: undefined rounding mode %d\n", round);
                 break;
         }
         qcom_sat = info->sat;
 #endif

         RoundingMode oldRound = set_round(round, outType);
         f(d, s, count);
         set_round(oldRound, outType);

         // Decide if we allow a zero result in addition to the correctly rounded
         // one
         memset(a, 0, count);
         if (gForceFTZ)
         {
             if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
             if (outType == kfloat)
                 setAllowZ((uint8_t *)a, (uint32_t *)d, count);
         }
     }
     else
     {
         // Copy the input to the reference
         memcpy(d, s, info->size * gTypeSizes[inType]);
     }

     // Patch up NaNs conversions to integer to zero -- these can be converted to
     // any integer
     if (info->outType != kfloat && info->outType != kdouble)
     {
         if (inType == kfloat)
         {
             float *inp = (float *)s;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]))
                     memset((char *)d + j * gTypeSizes[outType], 0,
                            gTypeSizes[outType]);
             }
         }
         if (inType == kdouble)
         {
             double *inp = (double *)s;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]))
                     memset((char *)d + j * gTypeSizes[outType], 0,
                            gTypeSizes[outType]);
             }
         }
     }
     else if (inType == kfloat || inType == kdouble)
     { // outtype and intype is float or double.  NaN conversions for float <->
       // double can be any NaN
         if (inType == kfloat && outType == kdouble)
         {
             float *inp = (float *)s;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j])) ((double *)d)[j] = NAN;
             }
         }
         if (inType == kdouble && outType == kfloat)
         {
             double *inp = (double *)s;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j])) ((float *)d)[j] = NAN;
             }
         }
     }

     return CL_SUCCESS;
 }

 static int DoTest(cl_device_id device, Type outType, Type inType,
                   SaturationMode sat, RoundingMode round, MTdata d)
 {
 #ifdef __APPLE__
     cl_ulong wall_start = mach_absolute_time();
 #endif

     DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
     WriteInputBufferInfo writeInputBufferInfo;
     int vectorSize;
     int error = 0;
     cl_uint threads = GetThreadCount();
     uint64_t i;

     gTestCount++;
     size_t blockCount =
         BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
     size_t step = blockCount;
     uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);

     memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
     init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
     if (NULL == init_info.d)
     {
         vlog_error(
             "ERROR: Unable to allocate storage for random number generator!\n");
         return -1;
     }
     for (i = 0; i < threads; i++)
     {
         init_info.d[i] = init_genrand(genrand_int32(d));
         if (NULL == init_info.d[i])
         {
             vlog_error("ERROR: Unable to allocate storage for random number "
                        "generator!\n");
             return -1;
         }
     }

     writeInputBufferInfo.outType = outType;
     writeInputBufferInfo.inType = inType;

     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         writeInputBufferInfo.calcInfo[vectorSize].program =
             MakeProgram(outType, inType, sat, round, vectorSize,
                         &writeInputBufferInfo.calcInfo[vectorSize].kernel);
         if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
         {
             gFailCount++;
             return -1;
         }
         if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
         {
             gFailCount++;
             vlog_error("\t\tFAILED -- Failed to create kernel.\n");
             return -2;
         }

         writeInputBufferInfo.calcInfo[vectorSize].parent =
             &writeInputBufferInfo;
         writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
         writeInputBufferInfo.calcInfo[vectorSize].result = -1;
     }

     if (gSkipTesting) goto exit;

     // Patch up rounding mode if default is RTZ
     // We leave the part above in default rounding mode so that the right kernel
     // is compiled.
     if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
         init_info.round = round = kRoundTowardZero;

     // Figure out how many elements are in a work block

     // we handle 64-bit types a bit differently.
     if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;

     if (!gWimpyMode && gIsEmbedded)
         step = blockCount * EMBEDDED_REDUCTION_FACTOR;

     if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
     vlog("Testing... ");
     fflush(stdout);
     for (i = 0; i < (uint64_t)lastCase; i += step)
     {

         if (0 == (i & ((lastCase >> 3) - 1)))
         {
             vlog(".");
             fflush(stdout);
         }

         cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         writeInputBufferInfo.count = count;

         // Crate a user event to represent the status of the reference value
         // computation completion
         writeInputBufferInfo.calcReferenceValues =
             clCreateUserEvent(gContext, &error);
         if (error || NULL == writeInputBufferInfo.calcReferenceValues)
         {
             vlog_error("ERROR: Unable to create user event. (%d)\n", error);
             gFailCount++;
             goto exit;
         }

         // retain for consumption by MapOutputBufferComplete
         for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
              vectorSize++)
         {
             if ((error =
                      clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
             {
                 vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
                 gFailCount++;
                 goto exit;
             }
         }

         // Crate a user event to represent when the callbacks are done verifying
         // correctness
         writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
         if (error || NULL == writeInputBufferInfo.calcReferenceValues)
         {
             vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
                        error);
             gFailCount++;
             goto exit;
         }

         // retain for use by the callback that calls this
         if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
         {
             vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
                        error);
             gFailCount++;
             goto exit;
         }

         //      Call this in a multithreaded manner
         //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d
         //      );
         cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
         init_info.start = i;
         init_info.size = count / chunks;
         if (init_info.size < 16384)
         {
             chunks = RoundUpToNextPowerOfTwo(threads);
             init_info.size = count / chunks;
             if (init_info.size < 16384)
             {
                 init_info.size = count;
                 chunks = 1;
             }
         }
         ThreadPool_Do(InitData, chunks, &init_info);

         // Copy the results to the device
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
                                           count * gTypeSizes[inType], gIn, 0,
                                           NULL, NULL)))
         {
             vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
             gFailCount++;
             goto exit;
         }

         // Call completion callback for the write, which will enqueue the rest
         // of the work.
         WriteInputBufferComplete((void *)&writeInputBufferInfo);

         // Make sure the work is actually running, so we don't deadlock
         if ((error = clFlush(gQueue)))
         {
             vlog_error("clFlush failed with error %d\n", error);
             gFailCount++;
             goto exit;
         }

         ThreadPool_Do(PrepareReference, chunks, &init_info);

         // signal we are done calculating the reference results
         if ((error = clSetUserEventStatus(
                  writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
         {
             vlog_error(
                 "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
                 error);
             gFailCount++;
             goto exit;
         }

         // Wait for the event callbacks to finish verifying correctness.
         if ((error = clWaitForEvents(
                  1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
         {
             vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
             gFailCount++;
             goto exit;
         }

         if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
         {
             vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
                        error);
             gFailCount++;
             goto exit;
         }

         if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
         {
             vlog_error("Error:  Failed to release done barrier:  %d\n", error);
             gFailCount++;
             goto exit;
         }


         for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
              vectorSize++)
         {
             if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
             {
                 switch (inType)
                 {
                     case kuchar:
                     case kchar:
                         vlog("Input value: 0x%2.2x ",
                              ((unsigned char *)gIn)[error - 1]);
                         break;
                     case kushort:
                     case kshort:
                         vlog("Input value: 0x%4.4x ",
                              ((unsigned short *)gIn)[error - 1]);
                         break;
                     case kuint:
                     case kint:
                         vlog("Input value: 0x%8.8x ",
                              ((unsigned int *)gIn)[error - 1]);
                         break;
                     case kfloat:
                         vlog("Input value: %a ", ((float *)gIn)[error - 1]);
                         break;
                         break;
                     case kulong:
                     case klong:
                         vlog("Input value: 0x%16.16llx ",
                              ((unsigned long long *)gIn)[error - 1]);
                         break;
                     case kdouble:
                         vlog("Input value: %a ", ((double *)gIn)[error - 1]);
                         break;
                     default:
                         vlog_error("Internal error at %s: %d\n", __FILE__,
                                    __LINE__);
                         abort();
                         break;
                 }

                 // tell the user which conversion it was.
                 if (0 == vectorSize)
                     vlog(" (implicit scalar conversion from %s to %s)\n",
                          gTypeNames[inType], gTypeNames[outType]);
                 else
                     vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
                          sizeNames[vectorSize], gSaturationNames[sat],
                          gRoundingModeNames[round], gTypeNames[inType],
                          sizeNames[vectorSize]);

                 gFailCount++;
                 goto exit;
             }
         }
     }

     log_info("done.\n");

     if (gTimeResults)
     {
         // Kick off tests for the various vector lengths
         for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
              vectorSize++)
         {
             size_t workItemCount = blockCount / vectorSizes[vectorSize];
             if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
                 workItemCount /=
                     4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);

             double sum = 0.0;
             double bestTime = INFINITY;
             cl_uint k;
             for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
                 if ((error = RunKernel(
                          writeInputBufferInfo.calcInfo[vectorSize].kernel,
                          gInBuffer, gOutBuffers[vectorSize], workItemCount)))
                 {
                     gFailCount++;
                     goto exit;
                 }

                 // Make sure OpenCL is done
                 if ((error = clFinish(gQueue)))
                 {
                     vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }

                 uint64_t endTime = GetTime();
                 double time = SubtractTime(endTime, startTime);
                 sum += time;
                 if (time < bestTime) bestTime = time;
             }

             if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
             double clocksPerOp = bestTime * (double)gDeviceFrequency
                 * gComputeDevices * gSimdSize * 1e6
                 / (workItemCount * vectorSizes[vectorSize]);
             if (0 == vectorSize)
                 vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
                           "implicit convert %s -> %s", gTypeNames[inType],
                           gTypeNames[outType]);
             else
                 vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
                           "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
                           sizeNames[vectorSize], gSaturationNames[sat],
                           gRoundingModeNames[round], gTypeNames[inType],
                           sizeNames[vectorSize]);
         }
     }

     if (gWimpyMode)
         vlog("\tWimp pass");
     else
         vlog("\tpassed");

 #ifdef __APPLE__
     // record the run time
     vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
 #endif
     vlog("\n\n");
     fflush(stdout);


 exit:
     // clean up
     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
         clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
     }

     if (init_info.d)
     {
         for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
         free(init_info.d);
     }

     return error;
 }

 void MapResultValuesComplete(void *data);

 // Note: not called reentrantly
 void WriteInputBufferComplete(void *data)
 {
     cl_int status;
     WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
     cl_uint count = info->count;
     int vectorSize;

     info->barrierCount = gMaxVectorSize - gMinVectorSize;

     // now that we know that the write buffer is complete, enqueue callbacks to
     // wait for the main thread to finish calculating the reference results.
     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         size_t workItemCount =
             (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);

         if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
                                 gOutBuffers[vectorSize], workItemCount)))
         {
             gFailCount++;
             return;
         }

         info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
             gQueue, gOutBuffers[vectorSize], CL_TRUE,
             CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
             NULL, NULL, &status);
         {
             if (status)
             {
                 vlog_error("ERROR: WriteInputBufferComplete calback failed "
                            "with status: %d\n",
                            status);
                 gFailCount++;
                 return;
             }
         }
     }

     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         MapResultValuesComplete(info->calcInfo + vectorSize);
     }

     // Make sure the work starts moving -- otherwise we may deadlock
     if ((status = clFlush(gQueue)))
     {
         vlog_error(
             "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
             status);
         gFailCount++;
         return;
     }

     // e was already released by the main thread. It should be destroyed
     // automatically soon after we exit.
 }

 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                              void *data);

 // Note: May be called reentrantly
 void MapResultValuesComplete(void *data)
 {
     cl_int status;
     CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
     cl_event calcReferenceValues = info->parent->calcReferenceValues;

     // we know that the map is done, wait for the main thread to finish
     // calculating the reference values
     if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
                                      CalcReferenceValuesComplete, data)))
     {
         vlog_error("ERROR: clSetEventCallback failed in "
                    "MapResultValuesComplete with status: %d\n",
                    status);
         gFailCount++; // not thread safe -- being lazy here
     }

     // this thread no longer needs its reference to info->calcReferenceValues,
     // so release it
     if ((status = clReleaseEvent(calcReferenceValues)))
     {
         vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
                    "with status: %d\n",
                    status);
         gFailCount++; // not thread safe -- being lazy here
     }

     // no need to flush since we didn't enqueue anything

     // e was already released by WriteInputBufferComplete. It should be
     // destroyed automatically soon after we exit.
 }


 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                              void *data)
 {
     CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
     cl_uint vectorSize = info->vectorSize;
     cl_uint count = info->parent->count;
     Type outType =
         info->parent->outType; // the data type of the conversion result
     Type inType = info->parent->inType; // the data type of the conversion input
     size_t j;
     cl_int error;
     cl_event doneBarrier = info->parent->doneBarrier;

     // report spurious error condition
     if (CL_SUCCESS != status)
     {
         vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
                    status);
         gFailCount++; // lazy about thread safety here
         return;
     }

     // Now we know that both results have been mapped back from the device, and
     // the main thread is done calculating the reference results. It is now time
     // to check the results.

     // verify results
     void *mapped = info->p;

     // Patch up NaNs conversions to integer to zero -- these can be converted to
     // any integer
     if (outType != kfloat && outType != kdouble)
     {
         if (inType == kfloat)
         {
             float *inp = (float *)gIn;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]))
                     memset((char *)mapped + j * gTypeSizes[outType], 0,
                            gTypeSizes[outType]);
             }
         }
         if (inType == kdouble)
         {
             double *inp = (double *)gIn;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]))
                     memset((char *)mapped + j * gTypeSizes[outType], 0,
                            gTypeSizes[outType]);
             }
         }
     }
     else if (inType == kfloat || inType == kdouble)
     { // outtype and intype is float or double.  NaN conversions for float <->
       // double can be any NaN
         if (inType == kfloat && outType == kdouble)
         {
             float *inp = (float *)gIn;
             double *outp = (double *)mapped;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
             }
         }
         if (inType == kdouble && outType == kfloat)
         {
             double *inp = (double *)gIn;
             float *outp = (float *)mapped;
             for (j = 0; j < count; j++)
             {
                 if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
             }
         }
     }

     if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
         info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count,
                                               vectorSizes[vectorSize]);
     else
         info->result = 0;

     // Fill the output buffer with junk and release it
     {
         cl_uint pattern = 0xffffdead;
         memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
         if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
                                              mapped, 0, NULL, NULL)))
         {
             vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
                        "CalcReferenceValuesComplete  (%d)\n",
                        error);
             gFailCount++;
         }
     }

     if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
     {
         if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
         {
             vlog_error("ERROR: clSetUserEventStatus failed in "
                        "CalcReferenceValuesComplete (err: %d). We're probably "
                        "going to deadlock.\n",
                        status);
             gFailCount++;
             return;
         }

         if ((status = clReleaseEvent(doneBarrier)))
         {
             vlog_error("ERROR: clReleaseEvent failed in "
                        "CalcReferenceValuesComplete (err: %d).\n",
                        status);
             gFailCount++;
             return;
         }
     }
     // e was already released by WriteInputBufferComplete. It should be
     // destroyed automatically soon after all the calls to
     // CalcReferenceValuesComplete exit.
 }

 static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                               RoundingMode round, int vectorSize,
                               cl_kernel *outKernel)
 {
     cl_program program;
     char testName[256];
     int error = 0;

     std::ostringstream source;
     if (outType == kdouble || inType == kdouble)
         source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

     // Create the program. This is a bit complicated because we are trying to
     // avoid byte and short stores.
     if (0 == vectorSize)
     {
         // Create the type names.
         char inName[32];
         char outName[32];
         strncpy(inName, gTypeNames[inType], sizeof(inName));
         strncpy(outName, gTypeNames[outType], sizeof(outName));
         sprintf(testName, "test_implicit_%s_%s", outName, inName);

         source << "__kernel void " << testName << "( __global " << inName
                << " *src, __global " << outName << " *dest )\n";
         source << "{\n";
         source << "   size_t i = get_global_id(0);\n";
         source << "   dest[i] =  src[i];\n";
         source << "}\n";

         vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
              gTypeNames[outType]);
         fflush(stdout);
     }
     else
     {
         int vectorSizetmp = vectorSizes[vectorSize];

         // Create the type names.
         char convertString[128];
         char inName[32];
         char outName[32];
         switch (vectorSizetmp)
         {
             case 1:
                 strncpy(inName, gTypeNames[inType], sizeof(inName));
                 strncpy(outName, gTypeNames[outType], sizeof(outName));
                 snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
                          outName, gSaturationNames[sat],
                          gRoundingModeNames[round]);
                 snprintf(testName, 256, "test_%s_%s", convertString, inName);
                 vlog("Building %s( %s ) test\n", convertString, inName);
                 break;
             case 3:
                 strncpy(inName, gTypeNames[inType], sizeof(inName));
                 strncpy(outName, gTypeNames[outType], sizeof(outName));
                 snprintf(convertString, sizeof(convertString),
                          "convert_%s3%s%s", outName, gSaturationNames[sat],
                          gRoundingModeNames[round]);
                 snprintf(testName, 256, "test_%s_%s3", convertString, inName);
                 vlog("Building %s( %s3 ) test\n", convertString, inName);
                 break;
             default:
                 snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
                          vectorSizetmp);
                 snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
                          vectorSizetmp);
                 snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
                          outName, gSaturationNames[sat],
                          gRoundingModeNames[round]);
                 snprintf(testName, 256, "test_%s_%s", convertString, inName);
                 vlog("Building %s( %s ) test\n", convertString, inName);
                 break;
         }
         fflush(stdout);

         if (vectorSizetmp == 3)
         {
             source << "__kernel void " << testName << "( __global " << inName
                    << " *src, __global " << outName << " *dest )\n";
             source << "{\n";
             source << "   size_t i = get_global_id(0);\n";
             source << "   if( i + 1 < get_global_size(0))\n";
             source << "       vstore3( " << convertString
                    << "( vload3( i, src)), i, dest );\n";
             source << "   else\n";
             source << "   {\n";
             source << "       " << inName << "3 in;\n";
             source << "       " << outName << "3 out;\n";
             source << "       if( 0 == (i & 1) )\n";
             source << "           in.y = src[3*i+1];\n";
             source << "       in.x = src[3*i];\n";
             source << "       out = " << convertString << "( in ); \n";
             source << "       dest[3*i] = out.x;\n";
             source << "       if( 0 == (i & 1) )\n";
             source << "           dest[3*i+1] = out.y;\n";
             source << "   }\n";
             source << "}\n";
         }
         else
         {
             source << "__kernel void " << testName << "( __global " << inName
                    << " *src, __global " << outName << " *dest )\n";
             source << "{\n";
             source << "   size_t i = get_global_id(0);\n";
             source << "   dest[i] = " << convertString << "( src[i] );\n";
             source << "}\n";
         }
     }
     *outKernel = NULL;

     const char *flags = NULL;
     if (gForceFTZ) flags = "-cl-denorms-are-zero";

     // build it
     std::string sourceString = source.str();
     const char *programSource = sourceString.c_str();
     error = create_single_kernel_helper(gContext, &program, outKernel, 1,
                                         &programSource, testName, flags);
     if (error)
     {
         vlog_error("Failed to build kernel/program (err = %d).\n", error);
         clReleaseProgram(program);
         return NULL;
     }

     return program;
 }