cudaDeviceSynchronize () код ошибки 77: cudaErrorIllegalAddress

большое спасибо за чтение моей темы.

Я делаю работу CUDA, но продолжаю получать код ошибки cudadevicesynchronize() 77: cudaErrorIllegalAddress, без какой-либо идеи, почему. Я сделал поиск как кода, так и функции , к удивлению, появилось только несколько записей. Очень странный.

Я в основном суммирую все пиксели изображений. Чтобы мои вопросы имели как можно больше ссылок, я показываю весь свой код CUDA здесь:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>

using namespace std;

float random_float(void)
{
  return static_cast<float>(rand()) / RAND_MAX;
}


__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    extern __shared__ unsigned long long sdata[];

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    unsigned short x = 0;
    if(i < n)
    {
        x = input[i];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if(threadIdx.x < offset)
        {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }

        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean,  int N,  vector<string> filelist)
{
    int size = N*N;
    unsigned long long* dev_sum = 0;
    unsigned short* dev_img = 0;
    cudaError_t cudaStatus;
    const int block_size = 512;
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
    int L = filelist.size();

    // Choose which GPU to run on, change this on a multi-GPU system.

    double totalgpuinittime = 0;
    StartCounter(7);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    totalgpuinittime = GetCounter(7);

    unsigned short* img;
    unsigned short* pimg;
    unsigned long long* sum = new unsigned long long[num_blocks];
    unsigned long long* psum = sum;

    cout<<endl;
    cout << "gpu looping starts, and in progress ..." << endl;
    StartCounter(6);

    double totalfileiotime = 0;
    double totalh2dcpytime = 0;
    double totalkerneltime = 0;
    double totald2hcpytime = 0;
    double totalcpusumtime = 0;
    double totalloopingtime = 0;

    for (int k = 0; k < L; k++)
    {
        StartCounter(1);
        img = (unsigned short*)LoadTIFF(filelist[k].c_str());
        totalfileiotime += GetCounter(1);

        psum = sum;
        pimg = img;

        float gpumean = 0;

        memset(psum, 0, sizeof(unsigned long long)*num_blocks);

        StartCounter(2);
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }   

        totalh2dcpytime += GetCounter(2);

        StartCounter(3);
        //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
         //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
          reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
        totalkerneltime += GetCounter(3);

      // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "reduction Kernel launch failed: %sn", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.

                // !!!!!! following is where the code 77 error occurs!!!!!!!
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
            goto Error;
        }

        // Copy output vector from GPU buffer to host memory.
        StartCounter(4);
        cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        totald2hcpytime += GetCounter(4);

        StartCounter(5);
        for (int i = 0; i < num_blocks; i++)
        {
            gpumean += *psum;
            psum++;
        }

        gpumean /= N*N;
        totalcpusumtime += GetCounter(5);

        delete img; 
        img = NULL;

        cout<<gpumean<<endl;

    }

    int S = 1e+6;
    int F = filelist.size();
    float R = S/F;

    totalloopingtime = GetCounter(6);
    cout<<"gpu looping ends."<<endl<<endl;
    cout<< "analysis:"<<endl;
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
    cout<<"file I/O time: "<<endl;
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
    cout<<"host-to-device copy time: "<<endl;
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
    cout<<"pure gpu kerneling time: "<<endl;
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
    cout<<"device-to-host copy time: "<<endl;
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
    /*cout<<"cpu summing time: "<<endl;
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/

    /*cout <<"gpu looping time: " << endl;
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/


Error:
    cudaFree(dev_sum);
    cudaFree(dev_img);

    delete sum;
    sum = NULL;

    return cudaStatus;
}

void kernel(float* &mean, int N, vector<string> filelist)
{
    // wrapper and kernel
    cudaError_t cudaStatus = gpuWrapper(mean, N,  filelist);

    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "gpuWapper failed!");

    }

   // printf("mean is: %fn", mean);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    StartCounter(8);
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaDeviceReset failed!");

    }
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
    //return *mean;
}

Я назначено достаточное и эквивалентное пространство памяти как для хоста, так и для памяти устройства. Любые комментарии приветствуются.

1 ответов


хотя это может быть не единственным источником ошибки в коде, вы не выделяете динамическую общую память для ядра сокращения, что приводит к ошибке незаконной адресации, которую вы видите. Правильный запуск ядра должен быть чем-то вроде

size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);

это выделяет эквивалент одного unsigned long long для каждого потока, запущенного в ядре сокращения, который (по моему очень беглому чтению вашего кода) должен сделать массив общей памяти sdata правильный размер ядро для запуска без доступа за пределы этого массива.