# CUDA_PROFILE_LOG_VERSION 1.5
# CUDA_DEVICE 0 Tesla C1060
# TIMESTAMPFACTOR fd4920a156863f8
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 3.744 ] cputime=[ 2.000 ]
method=[ memcpyHtoD ] gputime=[ 3.968 ] cputime=[ 1.000 ]
method=[ _Z6vecAddPiS_S_ ] gputime=[ 6.656 ] cputime=[ 8.000 ] occupancy=[ 0.031 ]
method=[ memcpyDtoH ] gputime=[ 4.416 ] cputime=[ 17.000 ]
# Add source files here
EXECUTABLE := yourprojectName
# Cuda source files (compiled with cudacc)
CUFILES := yourprojectName.cu
(Make your changes in the yourprojectName.cu and yourprojectName_kernel.cu files.)
################################################################################
# Build description for the "saj" sample. This file only assigns variables;
# they are presumably consumed by the included common.mk (not shown here) --
# confirm against that file.
# Add source files here
# Name of the program to build.
EXECUTABLE := saj
# Cuda source files (compiled with cudacc)
CUFILES := saj.cu
# C/C++ source files (compiled with gcc / c++)
# Left empty: this sample lists no separate C/C++ sources.
CCFILES :=
################################################################################
# Rules and targets
# No rules are defined in this file itself; they are expected to come from the
# shared makefile located two directories up.
include ../../common/common.mk
/*
Hello world Program to compute the sum of two arrays of size N using GPU
(blockDim, blockIdx and grid concepts are deliberately not used, so that anybody can become familiar with CUDA)
@author Sajan Kumar.S
@email: nospam+ammasajan[A.T]gmail[.]com
*/
#include <stdio.h>
#include <stdlib.h>
#define N 20 // 20 elements
/*
 * vecAdd: element-wise sum of two integer arrays, C[i] = A[i] + B[i].
 *
 * Launch layout: one 1-D block; each thread handles the element whose index
 * equals its threadIdx.x (blockIdx is not used). Threads with
 * threadIdx.x >= N touch no memory, so launching more than N threads is safe.
 * Shared memory: three static int arrays of N elements each.
 *
 * A, B : device pointers to the input arrays (read, first N elements at most).
 * C    : device pointer to the output array (written, first N elements at most).
 */
__global__ void vecAdd(int *A, int *B, int *C){
    int i=threadIdx.x;
    __shared__ int s_A[N],s_B[N],s_C[N]; // N is limited by the size of shared memory

    // Guard against a block larger than N: the shared arrays hold only N ints.
    const bool inRange=(i<N);

    // Stage the inputs in shared memory.
    if(inRange){
        s_A[i]=A[i];
        s_B[i]=B[i];
    }
    // Barriers stay outside the guarded branches so every thread of the block
    // reaches them.
    __syncthreads();

    // Sum of the two elements (the original used '*', which produced the
    // product although the kernel is documented and named as an addition).
    if(inRange){
        s_C[i]=s_A[i]+s_B[i];
    }
    __syncthreads();

    // Write the result back to global memory.
    if(inRange){
        C[i]=s_C[i];
    }
}
/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what){
    if(status!=cudaSuccess){
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills two N-element arrays, copies them to the GPU, runs the
 * vecAdd kernel with one block of N threads, copies the result back and
 * prints all three arrays.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main(){
    int *h_a=0,*h_b=0,*h_c=0;
    int *d_a=0,*d_b=0,*d_c=0;
    int memSize=N*sizeof(int);

    // allocate host memory for N ints per array
    h_a=(int *)malloc(memSize);
    h_b=(int *)malloc(memSize);
    h_c=(int *)malloc(memSize);
    if(h_a==NULL || h_b==NULL || h_c==NULL){
        fprintf(stderr,"host allocation of %d bytes failed\n",memSize);
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // allocate GPU memory for N ints per array
    checkCuda(cudaMalloc((void **)&d_a,memSize),"cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b,memSize),"cudaMalloc d_b");
    checkCuda(cudaMalloc((void **)&d_c,memSize),"cudaMalloc d_c");

    // Init values of the A and B arrays (and clear the C array)
    for(int i=0;i<N;i++){
        h_a[i]=i+2;
        h_b[i]=i+3;
        h_c[i]=0;
    }

    // Copy the input values to the GPU arrays A and B
    checkCuda(cudaMemcpy(d_a,h_a,memSize,cudaMemcpyHostToDevice),"copy A to device");
    checkCuda(cudaMemcpy(d_b,h_b,memSize,cudaMemcpyHostToDevice),"copy B to device");

    // print the A array and B array on the CPU
    printf("\n Array A : \n");
    for(int i=0;i<N;i++)
        printf("%d\t",h_a[i]);
    printf("\n Array B : \n");
    for(int i=0;i<N;i++)
        printf("%d\t",h_b[i]);

    printf("\ncalucalting Sum : ");
    vecAdd<<<1, N>>>(d_a,d_b,d_c);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(),"vecAdd launch");

    // Copy the output C from the GPU to host memory. This copy is blocking,
    // so errors raised while the kernel was executing surface here as well.
    checkCuda(cudaMemcpy(h_c,d_c,memSize,cudaMemcpyDeviceToHost),"copy C to host");

    printf("\nSum of Arrays: \n");
    for(int i=0;i<N;i++)
        printf("%d\t",h_c[i]);

    checkCuda(cudaFree(d_a),"cudaFree d_a");
    checkCuda(cudaFree(d_b),"cudaFree d_b");
    checkCuda(cudaFree(d_c),"cudaFree d_c");
    free(h_a);
    free(h_b);
    free(h_c);

    // 0 signals success to the caller (the original returned 1).
    return 0;
}
Have a question about something in this article? You can receive help directly from the article author. Sign up for a free trial to get started.
Comments (0)