CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
device_dump.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
26 #pragma once
27 
28 #include <stdio.h>
29 #include "cutlass/cutlass.h"
30 
37 namespace cutlass {
38 namespace debug {
39 
40 /******************************************************************************
41  * Dump the fragments
42  ******************************************************************************/
43 
/// Prints the contents of per-thread fragments, one thread at a time.
///
/// Every thread of the block has to call this function with the same N, M and
/// S: all paths through it (including the argument-error paths) execute
/// __syncthreads(), and the threads take turns printing between barriers.
///
/// \param frag  Fragment to print; must provide size(), operator[] and a
///              nested value_type. Elements are converted to float and printed
///              with "%.0f".
/// \param N     Number of threads (by linear thread id, starting at 0) whose
///              fragment is printed. 0 selects every thread of the block.
/// \param M     Upper bound (exclusive) on the element indices printed.
///              0 selects frag.size().
/// \param S     Step between printed element indices; must lie in [1, M]
///              after M has been resolved.
template <typename Fragment>
CUTLASS_DEVICE void dump_fragment(Fragment const& frag, int N = 0, int M = 0,
                                  int S = 1) {
  int const threads_in_block = blockDim.x * blockDim.y * blockDim.z;

  // Row-major linearisation of the 3-D block and thread coordinates.
  int const linear_block =
      blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
  int const linear_thread =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  // Only the first thread of the first block emits banners and diagnostics.
  bool const is_reporter = (linear_thread == 0) && (linear_block == 0);

  bool const thread_count_ok = (N >= 0) && (N <= threads_in_block);
  if (!thread_count_ok) {
    if (is_reporter) {
      printf("Thread number N = %d should between [1, %d].\n", N,
             threads_in_block);
    }
    __syncthreads();
    return;
  }

  int const fragment_elements = frag.size();

  bool const element_count_ok = (M >= 0) && (M <= fragment_elements);
  if (!element_count_ok) {
    if (is_reporter) {
      printf("Element number M = %d should between [1, %d].\n", M,
             fragment_elements);
    }
    __syncthreads();
    return;
  }

  // A value of zero means "everything".
  int const thread_limit = (N == 0) ? threads_in_block : N;
  int const element_limit = (M == 0) ? fragment_elements : M;

  bool const stride_ok = (S >= 1) && (S <= element_limit);
  if (!stride_ok) {
    if (is_reporter) {
      printf("Stride S = %d should between [1, %d].\n", S, element_limit);
    }
    __syncthreads();
    return;
  }

  if (is_reporter) {
    printf("\n*******************Dumping the fragments*******************\n\n");
  }

  // Threads print in turn; the barrier at the end of each turn separates one
  // thread's printf calls from the next thread's.
  int turn = 0;
  while (turn < thread_limit) {
    if (turn == linear_thread) {
      // The label splits the linear thread id into groups of 32 (W) and the
      // position within the group (T).
      printf("TB%d W%d T%d: ", linear_block, turn / 32, turn % 32);
      for (int elem = 0; elem < element_limit; elem += S) {
        printf("%.0f ", float(typename Fragment::value_type(frag[elem])));
      }
      printf("\n");
    }

    __syncthreads();
    ++turn;
  }

  if (is_reporter) {
    printf("\n***********************************************************\n\n");
  }

  __syncthreads();
}
115 
116 /******************************************************************************
117  * Dump the shared memory
118  ******************************************************************************/
119 
/// Number of bytes printed per output row by dump_shmem().
#define SHMEM_ROW_SIZE 128

/// Prints `size` elements starting at `ptr`, SHMEM_ROW_SIZE bytes per row.
///
/// Intended for shared-memory buffers (per the section title); the function
/// itself does not check which memory space `ptr` refers to. Every thread of
/// the block has to call it with the same arguments, because all paths execute
/// __syncthreads(). Thread 0 of each calling block does the printing.
///
/// \param ptr   First element to print. A null pointer is reported and
///              nothing is dumped.
/// \param size  Number of elements (not bytes) to print. Must be at least 1.
/// \param S     Step between printed elements within a row; must lie in
///              [1, SHMEM_ROW_SIZE / sizeof(Element)].
///
/// Elements are converted to float and printed with "%.0f". The last row is
/// truncated at `size`, so no element at or beyond `ptr + size` is read.
template <typename Element>
CUTLASS_DEVICE void dump_shmem(Element const* ptr, size_t size, int S = 1) {
  int block_id =
      blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
  int thread_id = (threadIdx.z * (blockDim.x * blockDim.y)) +
                  (threadIdx.y * blockDim.x) + threadIdx.x;

  if (ptr == nullptr) {
    if (thread_id == 0 && block_id == 0) printf("ptr is null.\n");

    __syncthreads();
    return;
  }

  if (size < 1) {
    if (thread_id == 0 && block_id == 0)
      printf("Element size is less than 1\n");

    __syncthreads();

    return;
  }

  int row_elements = SHMEM_ROW_SIZE / sizeof(Element);

  // Also rejects element types wider than a row: row_elements is then 0 and
  // no S >= 1 can satisfy S <= row_elements, so the loops below always see
  // row_elements >= 1.
  if (S < 1 || S > row_elements) {
    if (thread_id == 0 && block_id == 0)
      printf("Stride S = %d should between [1, %d].\n", S, row_elements);

    __syncthreads();

    return;
  }

  // Barrier before reading, so that writes made by the other threads of the
  // block ahead of this call are visible to the printing thread.
  __syncthreads();

  if (thread_id == 0) {
    printf("\n********Dumping the shared memory of TB %d*******\n\n", block_id);

    size_t const row_stride = static_cast<size_t>(row_elements);
    size_t const col_stride = static_cast<size_t>(S);

    for (size_t row = 0; row < size; row += row_stride) {
      // Clamp the final row to the elements that actually exist; without this
      // a size that is not a multiple of row_elements reads past the buffer.
      size_t const remaining = size - row;
      size_t const cols = remaining < row_stride ? remaining : row_stride;

      for (size_t col = 0; col < cols; col += col_stride) {
        printf("%.0f ", float(ptr[row + col]));
      }

      printf("\n");
    }

    printf("\n***********************************************************\n\n");
  }

  __syncthreads();

  return;
}
180 } // namespace debug
181 } // namespace cutlass
Definition: aligned_buffer.h:35
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
CUTLASS_DEVICE void dump_fragment(Fragment const &frag, int N=0, int M=0, int S=1)
Definition: device_dump.h:48
#define CUTLASS_PRAGMA_NO_UNROLL
Definition: cutlass.h:111
#define SHMEM_ROW_SIZE
Definition: device_dump.h:120
Basic include for CUTLASS.
CUTLASS_DEVICE void dump_shmem(Element const *ptr, size_t size, int S=1)
Definition: device_dump.h:125