[bazel] enable sccache+nvcc in CI (#95528)

Fixes #79348

This change is mostly focused on enabling nvcc+sccache in the PyTorch CI.

Along the way we had to make a couple of tweaks:
1.  Split the rules_cc from the rules_cuda that embedded them before. This is needed in order to apply a different patch to the rules_cc compared to the one that rules_cuda applies by default. This is in turn needed because we need to work around an nvcc behavior where it doesn't send `-iquote xxx` to the host compiler, but it does send `-isystem xxx`. So we work around this problem by (ab)using `-isystem` instead. Without it we get errors like `xxx` is not found.

2. Work around a bug in bazel https://github.com/bazelbuild/bazel/issues/10167 that prevents us from using a straightforward and honest `nvcc` sccache wrapper. Instead we generate an ad-hoc, bazel-specific nvcc wrapper that has internal knowledge of the relative bazel paths to local_cuda. This allows us to work around the issue with CUDA symlinks. Without it we get `undeclared inclusion(s) in rule` errors all over the place for CUDA headers.

## Test plan

Green CI build https://github.com/pytorch/pytorch/actions/runs/4267147180/jobs/7428431740

Note that now it says "CUDA" in the sccache output

```
+ sccache --show-stats
Compile requests                    9784
Compile requests executed           6726
Cache hits                          6200
Cache hits (C/C++)                  6131
Cache hits (CUDA)                     69
Cache misses                         519
Cache misses (C/C++)                 201
Cache misses (CUDA)                  318
Cache timeouts                         0
Cache read errors                      0
Forced recaches                        0
Cache write errors                     0
Compilation failures                   0
Cache errors                           7
Cache errors (C/C++)                   7
Non-cacheable compilations             0
Non-cacheable calls                 2893
Non-compilation calls                165
Unsupported compiler calls             0
Average cache write                0.116 s
Average cache read miss           23.722 s
Average cache read hit             0.057 s
Failed distributed compilations        0
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95528
Approved by: https://github.com/huydhn
diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
index fd0af8c..cfca6fa 100755
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@@ -191,6 +191,7 @@
   set -e
 
   get_bazel
+  install_sccache_nvcc_for_bazel
 
   # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing
   # the runner
diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index e4172c6..c344b9b 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -95,6 +95,26 @@
   chmod +x tools/bazel
 }
 
+# This function is bazel specific because of the bug
+# in the bazel that requires some special paths massaging
+# as a workaround. See
+# https://github.com/bazelbuild/bazel/issues/10167
+function install_sccache_nvcc_for_bazel() {
+  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
+
+  # Write the `/usr/local/cuda/bin/nvcc`
+  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
+#!/bin/sh
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache /usr/local/cuda/bin/nvcc "\$@"
+else
+  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
+fi
+EOF
+
+  sudo chmod +x /usr/local/cuda/bin/nvcc
+}
+
 function install_monkeytype {
   # Install MonkeyType
   pip_install MonkeyType
diff --git a/.lintrunner.toml b/.lintrunner.toml
index dd94aae..940dea3 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -367,6 +367,7 @@
 exclude_patterns = [
     '**/contrib/**',
     '**/*.diff',
+    '**/*.patch',
     'third_party/**',
     'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
     'test/cpp/jit/upgrader_models/*.ptl',
diff --git a/WORKSPACE b/WORKSPACE
index c016da0..9272e44 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,6 +4,18 @@
 load("//tools/rules:workspace.bzl", "new_patched_local_repository")
 
 http_archive(
+    name = "rules_cc",
+    strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
+    patches = [
+        "//:tools/rules_cc/cuda_support.patch",
+    ],
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.tar.gz",
+        "https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.tar.gz",
+    ],
+)
+
+http_archive(
     name = "rules_cuda",
     strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda",
     urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"],
@@ -11,7 +23,7 @@
 
 load("@rules_cuda//cuda:dependencies.bzl", "rules_cuda_dependencies")
 
-rules_cuda_dependencies()
+rules_cuda_dependencies(with_rules_cc = False)
 
 load("@rules_cc//cc:repositories.bzl", "rules_cc_toolchains")
 
diff --git a/tools/rules_cc/cuda_support.patch b/tools/rules_cc/cuda_support.patch
new file mode 100644
index 0000000..d097eee
--- /dev/null
+++ b/tools/rules_cc/cuda_support.patch
@@ -0,0 +1,80 @@
+diff --git cc/private/toolchain/unix_cc_configure.bzl cc/private/toolchain/unix_cc_configure.bzl
+index ba992fc..e4e8364 100644
+--- cc/private/toolchain/unix_cc_configure.bzl
++++ cc/private/toolchain/unix_cc_configure.bzl
+@@ -27,6 +27,7 @@ load(
+     "which",
+     "write_builtin_include_directory_paths",
+ )
++load("@rules_cuda//cuda:toolchain.bzl", "cuda_compiler_deps")
+ 
+ def _field(name, value):
+     """Returns properly indented top level crosstool field."""
+@@ -397,7 +398,7 @@ def configure_unix_toolchain(repository_ctx, cpu_value, overriden_tools):
+     cxx_opts = split_escaped(get_env_var(
+         repository_ctx,
+         "BAZEL_CXXOPTS",
+-        "-std=c++0x",
++        "-std=c++11",
+         False,
+     ), ":")
+ 
+@@ -463,7 +464,7 @@ def configure_unix_toolchain(repository_ctx, cpu_value, overriden_tools):
+             )),
+             "%{cc_compiler_deps}": get_starlark_list([":builtin_include_directory_paths"] + (
+                 [":cc_wrapper"] if darwin else []
+-            )),
++            ) + cuda_compiler_deps()),
+             "%{cc_toolchain_identifier}": cc_toolchain_identifier,
+             "%{compile_flags}": get_starlark_list(
+                 [
+diff --git cc/private/toolchain/unix_cc_toolchain_config.bzl cc/private/toolchain/unix_cc_toolchain_config.bzl
+index c3cf3ba..1744eb4 100644
+--- cc/private/toolchain/unix_cc_toolchain_config.bzl
++++ cc/private/toolchain/unix_cc_toolchain_config.bzl
+@@ -25,6 +25,7 @@ load(
+     "variable_with_value",
+     "with_feature_set",
+ )
++load("@rules_cuda//cuda:toolchain.bzl", "cuda_toolchain_config")
+ 
+ all_compile_actions = [
+     ACTION_NAMES.c_compile,
+@@ -580,7 +581,8 @@ def _impl(ctx):
+                 ],
+                 flag_groups = [
+                     flag_group(
+-                        flags = ["-iquote", "%{quote_include_paths}"],
++                        # -isystem because there is an nvcc thing where it doesn't forward -iquote to host compiler.
++                        flags = ["-isystem", "%{quote_include_paths}"],
+                         iterate_over = "quote_include_paths",
+                     ),
+                     flag_group(
+@@ -1152,10 +1154,15 @@ def _impl(ctx):
+             unfiltered_compile_flags_feature,
+         ]
+ 
++    cuda = cuda_toolchain_config(
++        cuda_toolchain_info = ctx.attr._cuda_toolchain_info,
++        compiler_path = ctx.attr.tool_paths["gcc"],
++    )
++
+     return cc_common.create_cc_toolchain_config_info(
+         ctx = ctx,
+-        features = features,
+-        action_configs = action_configs,
++        features = features + cuda.features,
++        action_configs = action_configs + cuda.action_configs,
+         cxx_builtin_include_directories = ctx.attr.cxx_builtin_include_directories,
+         toolchain_identifier = ctx.attr.toolchain_identifier,
+         host_system_name = ctx.attr.host_system_name,
+@@ -1192,6 +1199,9 @@ cc_toolchain_config = rule(
+         "tool_paths": attr.string_dict(),
+         "toolchain_identifier": attr.string(mandatory = True),
+         "unfiltered_compile_flags": attr.string_list(),
++        "_cuda_toolchain_info": attr.label(
++            default = Label("@rules_cuda//cuda:cuda_toolchain_info"),
++        ),
+     },
+     provides = [CcToolchainConfigInfo],
+ )