microsoft / GLIP

Grounded Language-Image Pre-training

How to fix the demo for PyTorch 2.0.0 / CUDA 11.8 #164

Closed sangphamitus closed 5 months ago

sangphamitus commented 6 months ago

I managed to fix this, and it works on my Windows 11 machine with CUDA 11.8 and PyTorch 2.0.0, without using Docker 🎊. Happy coding 😊.

PyTorch 2.0.0 removed the THC libraries, which causes many build errors in the CUDA extensions.
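Before patching, it is worth confirming the environment actually matches those versions; a minimal check using standard PyTorch attributes:

```Python
import torch

print(torch.__version__)         # expect 2.0.0
print(torch.version.cuda)        # expect 11.8
print(torch.cuda.is_available()) # building/running the CUDA ops needs a working GPU setup
```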

.cu files

1. `deform_conv_cuda.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/ThrustAllocator.h>
```
2. `deform_pool_cuda.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/ThrustAllocator.h>
```

3. `ml_nms.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/ThrustAllocator.h>
...
// line 69: change this 
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 
// to
const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
...
// line 84: change this 
const int col_blocks =  THCCeilDiv(boxes_num, threadsPerBlock);
//to
const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
...
//line 88:
THCState *state = at::globalContext().lazyInitCUDA(); // remove this line
...
//line 91:
THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, boxes_num * col_blocks * sizeof(unsigned long long))); // remove this
...
//line 94: change this
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
//to
mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
...
//line 97: change this
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),THCCeilDiv(boxes_num, threadsPerBlock));
// to
dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),at::ceil_div(boxes_num, threadsPerBlock));
...
//line 107: change this
THCudaCheck(cudaMemcpy(&mask_host[0],  mask_dev,sizeof(unsigned long long) * boxes_num * col_blocks,cudaMemcpyDeviceToHost));
//to
C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));
...
//line 130: change this
THCudaFree(state, mask_dev);
// to
c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
```
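For context, `raw_alloc` / `raw_delete` route the temporary mask buffer through PyTorch's CUDA caching allocator, which replaces the removed THC allocator. The same allocator is exposed in Python, so a quick sanity check is possible on a CUDA-capable machine:

```Python
import torch

# Allocate and free 1 MiB through the caching allocator that
# c10::cuda::CUDACachingAllocator::raw_alloc uses on the C++ side.
ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
torch.cuda.caching_allocator_delete(ptr)
```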
4. `nms.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/ThrustAllocator.h>
...
// line 64: change this 
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 
// to
const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
...
// line 79: change this 
const int col_blocks =  THCCeilDiv(boxes_num, threadsPerBlock);
//to
const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
...
//line 83:
THCState *state = at::globalContext().lazyInitCUDA(); // remove this line
...
//line 86:
THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, boxes_num * col_blocks * sizeof(unsigned long long))); // remove this
...
//line 89: change this
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
// to
mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
...
//line 91: change this
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),THCCeilDiv(boxes_num, threadsPerBlock));
// to:
dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),at::ceil_div(boxes_num, threadsPerBlock));
...
//line 100: change this
THCudaCheck(cudaMemcpy(&mask_host[0], mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));
// to
C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));
...
// line 125: change this
THCudaFree(state, mask_dev);
// to
c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
```
5. `ROIAlign_cuda.cu`, `ROIPool_cuda.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/ceil_div.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/ThrustAllocator.h>
...
// line 275: change this
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
// to
dim3 grid(std::min(((int)output_size + 512 - 1) / 512, 4096));
...
// change this
THCudaCheck(cudaGetLastError());
// to
C10_CUDA_CHECK(cudaGetLastError());
...
// line 320: change this
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
// to
dim3 grid(std::min(((int)(grad.numel()) + 512 - 1) / 512, 4096));
```

(The `(n + 512 - 1) / 512` expressions are plain integer ceiling division; see the short sketch after this list.)


6. `SigmoidFocalLoss_cuda.cu`
```C
#include <THC/THC.h> // remove this import

// add all imports below
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/ThrustAllocator.h>
...
// change this
THCudaCheck(cudaGetLastError());
// to
C10_CUDA_CHECK(cudaGetLastError());
...
// change this
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
// to
dim3 grid(std::min(((int)losses_size + 512 - 1) / 512, 4096));
...
// change this
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
// to
dim3 grid(std::min(((int)d_logits_size + 512 - 1) / 512, 4096));
```
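A side note on the `(n + 512 - 1) / 512` expressions in items 5 and 6: this is ordinary integer ceiling division, the same value that `THCCeilDiv` and `at::ceil_div` compute. A minimal check of the identity:

```Python
import math

def ceil_div(n: int, d: int) -> int:
    # The (n + d - 1) // d trick used in the grid-size expressions above.
    return (n + d - 1) // d

for n in (0, 1, 511, 512, 513, 100_000):
    assert ceil_div(n, 512) == math.ceil(n / 512)
print("ceil_div matches math.ceil for all test sizes")
```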

.py files

1. `box_aug.py`, `build.py`
```Python
# change
if cfg.INPUT.FORMAT is not '':
# to
if cfg.INPUT.FORMAT != '':
```
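Why this matters: `is` tests object identity, not equality, so comparing against a literal like `''` is unreliable (and CPython 3.8+ emits `SyntaxWarning: "is" with a literal`). The same fix applies to the `is '*'` and `is -1` comparisons in the `clip_model.py` and model-loading items below. A minimal illustration:

```Python
s = "".join(["he", "llo"])  # builds the string "hello" at runtime
print(s == "hello")         # True: == compares values
print(s is "hello")         # typically False: is compares object identity,
                            # and s is a different object than the literal
```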
2. `lvis_eval.py`
```Python
# change
tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
# to
tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float32)
fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float32)
```
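Background: `np.float` was a deprecated alias for Python's built-in `float` (64-bit) and was removed in NumPy 1.24, so these calls now raise `AttributeError`. `np.float64` is the exact drop-in replacement; `np.float32`, as used here, also works and halves memory at reduced precision. A quick check:

```Python
import numpy as np

tps = np.array([[True, False, True]])
tp_sum = np.cumsum(tps, axis=1).astype(np.float32)  # works on all NumPy versions
print(tp_sum.dtype, tp_sum)                         # float32 [[1. 1. 2.]]
# On NumPy >= 1.24, .astype(np.float) raises AttributeError instead.
```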

3. `deform_conv.py`, `nms.py`, `roi_align.py`, `roi_pool.py`, `sigmoid_focal_loss.py`
```Python
# change
from maskrcnn_benchmark import _C
# to
import maskrcnn_benchmark._C as _C
```
4. `clip_model.py`
```Python
# change
or pretrained_layers[0] is '*'
# to
or pretrained_layers[0] == '*'
```
5. `anchor_generator.py`
```Python
# change
np.array(sizes, dtype=np.float) / stride,
np.array(aspect_ratios, dtype=np.float),
# to
np.array(sizes, dtype=np.float32) / stride,
np.array(aspect_ratios, dtype=np.float32),
...
# change
anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1
# to
anchor = np.array([1, 1, base_size, base_size], dtype=np.float32) - 1
```
6. `big_model_loading.py`, `c2_model_loading.py`, `pretrain_model_loading.py`
```Python
# change
if old_key.find(param) is -1:
# to
if old_key.find(param) == -1:
```
7. `imports.py`
```Python
# torch._six was removed in PyTorch 2.0, so replace the
# torch._six-based loader with plain importlib:
import importlib
import importlib.util
import sys

# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
def import_file(module_name, file_path, make_importable=False):
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    if make_importable:
        sys.modules[module_name] = module
    return module
```
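A short usage sketch for the helper above; the module name and file path are placeholders for illustration only:

```Python
# Load a module directly from a file path (both arguments are placeholders).
cfg = import_file(
    "my_project.paths_catalog",   # placeholder module name
    "/path/to/paths_catalog.py",  # placeholder file path
    make_importable=True,
)
```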
8. `model_zoo.py`
```Python
# change
from torch.hub import _download_url_to_file
# to
from torch.hub import download_url_to_file
...
# change
from torch.utils.model_zoo import _download_url_to_file
# to
from torch.hub import download_url_to_file
```
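For reference, `download_url_to_file` is the public `torch.hub` API that replaced the old underscore-prefixed helper. A minimal usage sketch; the URL and destination filename are placeholders:

```Python
from torch.hub import download_url_to_file

# Placeholder URL and destination, for illustration only.
download_url_to_file("https://example.com/checkpoint.pth", "checkpoint.pth", progress=True)
```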
huuthientran commented 6 months ago

Wow, this is pretty cool! Thanks for your hard work buddy πŸ”₯