Tutorial 8 - GPU Data Node
The GPU data node creates attributes of several representative types for use in a CUDA-based GPU compute. The types shown here are not exhaustive; see omni.graph.docs.ogn_attribute_types for the full list of supported attribute types.
This node also introduces attribute typedefs, a useful concept when passing data around in functions.
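For a quick preview of what those typedefs look like in use (this declaration is repeated verbatim from the C++ listing below), a single extern "C" declaration written in terms of the attribute typedefs resolves to the CPU-side types when compiled as C++ and to the CUDA-side types when compiled by the CUDA compiler, keeping the two halves of the node in sync:
// Repeated from OgnTutorialCudaData.cpp below - the attribute typedefs stand in for the
// concrete types (e.g. inputs::points_t is const GfVec3f* on the CPU side and const float3** in CUDA)
extern "C" void applyDeformationGPU(outputs::points_t outputPoints,
                                    inputs::points_t inputPoints,
                                    inputs::multiplier_t multiplier,
                                    size_t numberOfPoints);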
OgnTutorialCudaData.ogn
The ogn file shows the implementation of a node named “omni.graph.tutorials.CudaData”, which has inputs and outputs of several types, each used in one of three computations. Three separate CUDA methods are created to show how each of the types is passed through to the GPU and used by CUDA.
{
    "CudaData" : {
        "version": 1,
        "categories": "tutorials",
        "scheduling": ["threadsafe"],
        "memoryType": "cuda",
        "description": [
            "This is a tutorial node. It performs different functions on the GPU to illustrate different types of",
            "data access. The first adds inputs 'a' and 'b' to yield output 'sum', all of which are on the GPU.",
            "The second is a sample expansion deformation that multiplies every point on a set of input points,",
            "stored on the GPU, by a constant value, stored on the CPU, to yield a set of output points, also on the GPU.",
            "The third is an assortment of different data types illustrating how different data is passed to the GPU.",
            "This particular node uses CUDA for its GPU computations, as indicated in the memory type value.",
            "Normal use case for GPU compute is large amounts of data. For testing purposes this node only handles",
            "a very small amount but the principle is the same."
        ],
        "metadata":
        {
            "uiName": "Tutorial Node: Attributes With CUDA Data"
        },
        "inputs": {
            "a": {
                "type": "float",
                "description": ["First value to be added in algorithm 1"],
                "default": 0.0
            },
            "b": {
                "type": "float",
                "description": ["Second value to be added in algorithm 1"],
                "default": 0.0
            },
            "points": {
                "type": "float[3][]",
                "description": ["Points to be moved by algorithm 2"],
                "default": []
            },
            "multiplier": {
                "type": "float[3]",
                "memoryType": "cpu",
                "description": ["Amplitude of the expansion for the input points in algorithm 2"],
                "default": [1.0, 1.0, 1.0]
            },
            "half": {
                "type": "half",
                "description": ["Input of type half for algorithm 3"],
                "default": 1.0
            },
            "color": {
                "type": "colord[3]",
                "description": ["Input with three doubles as a color for algorithm 3"],
                "default": [1.0, 0.5, 1.0]
            },
            "matrix": {
                "type": "matrixd[4]",
                "description": ["Input with 16 doubles interpreted as a double-precision 4d matrix"],
                "default": [[1.0,0.0,0.0,0.0], [0.0,1.0,0.0,0.0], [0.0,0.0,1.0,0.0], [0.0,0.0,0.0,1.0]]
            }
        },
        "outputs": {
            "sum": {
                "type": "float",
                "description": ["Sum of the two inputs from algorithm 1"]
            },
            "points": {
                "type": "float[3][]",
                "description": ["Final positions of points from algorithm 2"]
            },
            "half": {
                "type": "half",
                "description": ["Output of type half for algorithm 3"]
            },
            "color": {
                "type": "colord[3]",
                "description": ["Output with three doubles as a color for algorithm 3"]
            },
            "matrix": {
                "type": "matrixd[4]",
                "description": ["Output with 16 doubles interpreted as a double-precision 4d matrix"]
            }
        },
        "$why": "By putting inputs but no outputs in tests I can run the compute without failing on extraction",
        "tests": [
            {
                "inputs:a": 1.0, "inputs:b": 2.0,
                "inputs:half": 1.0,
                "inputs:color": [0.5, 0.6, 0.7],
                "inputs:matrix": [[1.0,2.0,3.0,4.0],[2.0,3.0,4.0,5.0],[3.0,4.0,5.0,6.0],[4.0,5.0,6.0,7.0]],
                "inputs:points": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]],
                "inputs:multiplier": [2.0, 3.0, 4.0]
            }
        ],
        "$tests_disabled_until_gpu_data_extraction_works": [
            { "inputs:a": 1.0, "inputs:b": 2.0, "outputs:sum": 3.0 },
            { "inputs:a": 5.0, "inputs:b": 3.0, "outputs:sum": 8.0 },
            {
                "inputs:half": 1.0, "outputs:half": 2.0,
                "inputs:color": [0.5, 0.6, 0.7], "outputs:color": [0.5, 0.4, 0.3],
                "inputs:matrix": [[1.0,2.0,3.0,4.0],[2.0,3.0,4.0,5.0],[3.0,4.0,5.0,6.0],[4.0,5.0,6.0,7.0]],
                "outputs:matrix": [[30.0,40.0,50.0,60.0],[40.0,54.0,68.0,82.0],[50.0,68.0,86.0,104.0],[60.0,82.0,104.0,126.0]],
                "inputs:points": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]],
                "inputs:multiplier": [2.0, 3.0, 4.0],
                "outputs:points": [[2.0, 6.0, 12.0], [4.0, 9.0, 16.0]]
            }
        ]
    }
}
OgnTutorialCudaData.cpp
The cpp file contains the implementation of the compute method, which in turn calls the three CUDA algorithms.
// SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
//
// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
// property and proprietary rights in and to this material, related
// documentation and any modifications thereto. Any use, reproduction,
// disclosure or distribution of this material and related documentation
// without an express license agreement from NVIDIA CORPORATION or
// its affiliates is strictly prohibited.
#include <omni/graph/core/GpuArray.h>

#include <OgnTutorialCudaDataDatabase.h>

// This function exercises referencing simple data types on the GPU, something you normally wouldn't do as
// there is no efficiency gain in doing that versus just passing in the CPU value.
extern "C" void applyAddGPU(outputs::sum_t sum, inputs::a_t a, inputs::b_t b);

// This function exercises referencing of array data on the GPU.
// The CUDA code takes its own "float3" data types, which are castable equivalents to the generated GfVec3f.
// The GpuArray/ConstGpuArray wrappers isolate the GPU pointers from the CPU code.
extern "C" void applyDeformationGPU(outputs::points_t outputPoints,
                                    inputs::points_t inputPoints,
                                    inputs::multiplier_t multiplier,
                                    size_t numberOfPoints);

// This function exercises referencing non-standard data types on the GPU to illustrate how data of different
// types are passed to the GPU.
extern "C" void applyDataTypes(outputs::half_t halfOutput,
                               outputs::color_t colorOutput,
                               outputs::matrix_t matrixOutput,
                               inputs::half_t halfInput,
                               inputs::color_t colorInput,
                               inputs::matrix_t matrixInput);

// This node runs a couple of algorithms on the GPU, while accessing parameters from the CPU
class OgnTutorialCudaData
{
public:
    static bool compute(OgnTutorialCudaDataDatabase& db)
    {
        // ================ algorithm 1 =========================================================
        // It's an important distinction here that GPU data is always returned as raw pointers since the CPU code
        // in the node cannot directly access it. The raw pointers are passed into the GPU code for dereferencing.
        applyAddGPU(db.outputs.sum(), db.inputs.a(), db.inputs.b());

        // ================ algorithm 2 =========================================================
        size_t numberOfPoints = db.inputs.points.size();
        db.outputs.points.resize(numberOfPoints);
        const auto& multiplier = db.inputs.multiplier();
        if (numberOfPoints > 0)
        {
            applyDeformationGPU(db.outputs.points(), db.inputs.points(), multiplier, numberOfPoints);
        }

        // ================ algorithm 3 =========================================================
        applyDataTypes(db.outputs.half(), db.outputs.color(), db.outputs.matrix(), db.inputs.half(), db.inputs.color(),
                       db.inputs.matrix());
        return true;
    }
};

REGISTER_OGN_NODE()
OgnTutorialCudaData_CUDA.cu
The cu file contains the implementation of the algorithms on the GPU using CUDA.
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
#include <OgnTutorialCudaDataDatabase.h>

// ======================================================================
// CUDA compute implementation code will usually have two methods:
//     xxxGPU()  - Host function to act as an intermediary between CPU and GPU code
//     xxxCUDA() - CUDA implementation of the actual node algorithm.

// ======================================================================
// Algorithm 1 support - add two values together.
// Note how the generated typedefs are used to make declarations easier.
// On the CUDA side the types are different from what you'd see on the CPU side,
// e.g. a CPU type of GfVec3f appears in CUDA as float3.
__global__ void applyAddCUDA(outputs::sum_t output, inputs::a_t a, inputs::b_t b)
{
    *output = *a + *b;
}
extern "C" void applyAddGPU(outputs::sum_t output, inputs::a_t a, inputs::b_t b)
{
    // Launch the GPU code - only a single thread and block is needed since this is a single set of data
    applyAddCUDA<<<1, 1>>>(output, a, b);
}

// ======================================================================
// Algorithm 2 support - apply the multiplier to "inputPoints" to yield "outputPoints"
__global__ void applyDeformationCUDA(
    outputs::points_t outputPoints,
    inputs::points_t inputPoints,
    inputs::multiplier_t multiplier,
    size_t numberOfPoints
)
{
    // Make sure the current evaluation block is in range of the available points
    int currentPointIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (numberOfPoints <= currentPointIndex) return;

    // Apply the multiplier to the current points
    (*outputPoints)[currentPointIndex].x = (*inputPoints)[currentPointIndex].x * multiplier.x;
    (*outputPoints)[currentPointIndex].y = (*inputPoints)[currentPointIndex].y * multiplier.y;
    (*outputPoints)[currentPointIndex].z = (*inputPoints)[currentPointIndex].z * multiplier.z;
}
// CPU interface called from OgnTutorialCudaData::compute, used to launch the GPU workers.
// "numberOfPoints" is technically redundant since it's equal to inputPoints.size(), however that call
// is only available on the __device__ (GPU) side and the value is needed in the calculation of the
// number of blocks required here on the __host__ (CPU) side so it's better to pass it in.
extern "C" void applyDeformationGPU(
    outputs::points_t outputPoints,
    inputs::points_t inputPoints,
    inputs::multiplier_t& multiplier, // Using a ref here gives a matching type for CPU-side data
    size_t numberOfPoints
)
{
    // Use 256 threads per block, an arbitrary number that could be more precisely tuned when necessary
    const int numberOfThreads = 256;
    // The number of blocks is however many groups of "numberOfThreads" are needed to cover all of the points
    const int numberOfBlocks = (numberOfPoints + numberOfThreads - 1) / numberOfThreads;

    // Launch the GPU deformation using the calculated number of threads and blocks
    applyDeformationCUDA<<<numberOfBlocks, numberOfThreads>>>(outputPoints, inputPoints, multiplier, numberOfPoints);
}

// ======================================================================
// Algorithm 3 support - simple manipulation of an input to yield the similarly named output.
// Typedefs are used to make declaration simple - the actual CUDA types are in the comment beside the attribute name.
__global__ void applyDataTypesCUDA(
    outputs::half_t halfOutput,     // __half
    outputs::color_t colorOutput,   // double3
    outputs::matrix_t matrixOutput, // Matrix4d
    inputs::half_t halfInput,       // const __half
    inputs::color_t colorInput,     // const double3
    inputs::matrix_t matrixInput    // const Matrix4d
)
{
    // half and color values are doubled
    *halfOutput = __hadd(*halfInput, *halfInput);
    *colorOutput = *colorInput + *colorInput;
    // matrix value is squared
    *matrixOutput = *matrixInput;
    *matrixOutput *= *matrixInput;
}
extern "C" void applyDataTypes(
    outputs::half_t halfOutput,
    outputs::color_t colorOutput,
    outputs::matrix_t matrixOutput,
    inputs::half_t halfInput,
    inputs::color_t colorInput,
    inputs::matrix_t matrixInput
)
{
    // Launch the GPU code - only a single thread and block is needed since this is a single set of data
    applyDataTypesCUDA<<<1, 1>>>(halfOutput, colorOutput, matrixOutput, halfInput, colorInput, matrixInput);
}
GPU Attribute Access
Here is the set of attribute accessors generated in the database. Attributes whose data lives on the GPU return raw pointers to that memory, since the CPU side cannot dereference it into its actual type; for example, a float value that would be returned as a float& on the CPU side is instead returned as a float* when it lives on the GPU.
In addition, when calling into CUDA code the data changes type as it crosses the GPU boundary. The CUDA side uses the CUDA-native data types where it can, which are bytewise compatible with their CPU counterparts. Note that in the case of the CPU attribute multiplier the data is passed to the CUDA code by value, since it has to be copied from the CPU to the GPU.
| Database Function   | Is GPU? | CPU Type           | CUDA Type       |
|---------------------|---------|--------------------|-----------------|
| inputs.a()          | Yes     | const float*       | const float*    |
| inputs.b()          | Yes     | const float*       | const float*    |
| outputs.sum()       | Yes     | float*             | float*          |
| inputs.half()       | Yes     | const pxr::GfHalf* | const __half*   |
| outputs.half()      | Yes     | pxr::GfHalf*       | __half*         |
| inputs.color()      | Yes     | const GfVec3d*     | const double3*  |
| outputs.color()     | Yes     | GfVec3d*           | double3*        |
| inputs.matrix()     | Yes     | const GfMatrix4d*  | const Matrix4d* |
| outputs.matrix()    | Yes     | GfMatrix4d*        | Matrix4d*       |
| inputs.multiplier() | No      | const GfVec3f&     | const float3    |
| inputs.points()     | Yes     | const GfVec3f*     | const float3**  |
| outputs.points()    | Yes     | GfVec3f*           | float3**        |
The array attribute points does not have an array-like wrapper, as the CUDA code would rather deal with raw pointers. To provide the size information, the value of inputs.points.size() should be passed in alongside the data when calling the CUDA code.
Notice the subtle difference in the CPU-side types for GPU-based data: instead of references to the data there are pointers, which is necessary since the data lives in a different memory space, and all pointers gain an extra level of indirection for the same reason.
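As a minimal sketch of what this means inside the node (simplified from the compute method shown earlier, not generated code), the GPU accessors hand back device pointers that the CPU merely forwards to the kernel launchers, while the CPU-resident multiplier can be read directly:
// Minimal sketch, simplified from OgnTutorialCudaData::compute above.
// GPU attribute accessors return raw device pointers that the CPU must treat as opaque.
static bool compute(OgnTutorialCudaDataDatabase& db)
{
    inputs::a_t aOnGpu = db.inputs.a();          // const float* into GPU memory
    outputs::sum_t sumOnGpu = db.outputs.sum();  // float* into GPU memory
    const auto& multiplierOnCpu = db.inputs.multiplier(); // const GfVec3f& to CPU memory - usable here
    // Dereferencing aOnGpu or sumOnGpu here would be invalid; that only happens inside the CUDA kernels
    applyAddGPU(sumOnGpu, aOnGpu, db.inputs.b());
    return true;
}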
There is also a section of this generated file dedicated to information relevant to the CUDA code; in this section the CUDA attribute data types are defined. It is protected with #ifdef __CUDACC__ so that it is only processed when included through the CUDA compiler (and vice versa, so none of the other setup code will be processed on the CUDA side).
#include <cuda_fp16.h>
#include <omni/graph/core/cuda/CUDAUtils.h>
#include <omni/graph/core/cuda/Matrix4d.h>
namespace OgnTutorialCudaDataCudaTypes
{
namespace inputs
{
using a_t = const float*;
using b_t = const float*;
using points_t = const float3**;
using multiplier_t = const float3*;
using half_t = const __half*;
using color_t = const double3*;
using matrix_t = const Matrix4d*;
}
namespace outputs
{
using sum_t = float*;
using points_t = float3**;
using half_t = __half*;
using color_t = double3*;
using matrix_t = Matrix4d*;
}
}
using namespace OgnTutorialCudaDataCudaTypes;
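The #ifdef guard itself is not reproduced in the excerpt above; as a rough structural sketch (the generated header's exact layout may differ), the split behaves like this:
// Illustrative sketch only - not the verbatim generated header.
#ifdef __CUDACC__
    // Seen only by the CUDA compiler: the CUDA-native typedef section shown above
    #include <cuda_fp16.h>
    #include <omni/graph/core/cuda/CUDAUtils.h>
    #include <omni/graph/core/cuda/Matrix4d.h>
    namespace OgnTutorialCudaDataCudaTypes { /* ...CUDA attribute typedefs... */ }
    using namespace OgnTutorialCudaDataCudaTypes;
#else
    // Seen only by the C++ compiler: the regular database class and CPU-side setup
    // (OgnTutorialCudaDataDatabase, accessors such as db.inputs.a(), db.outputs.sum(), ...)
#endif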
Notice the inclusion of the file cuda_fp16.h, needed due to the use of the CUDA type __half, and the files omni/graph/core/cuda/CUDAUtils.h and omni/graph/core/cuda/Matrix4d.h, which provide support functions for CUDA math.
The data types used by CUDA are compatible with their equivalents on the C++ side, so arguments can be passed into CUDA from C++ by using a declaration such as this one on the C++ side:
// In this code "inputs::points_t" is the type "GfVec3f*".
// The size of that array must be passed in as well since it is not implicit in the data type.
extern "C" void cudaCompute(
inputs::points_t*,
size_t pointSize,
inputs::multiplier_t*,
outputs::points_t*
);
which corresponds to this function defined in the .cu file:
// In this code "inputs::points_t" is the type "float3*"
extern "C" void cudaCompute(
inputs::points_t* inPoints,
size_t pointSize,
inputs::multiplier_t* multiplier,
outputs::points_t* outPoints
)
{...}
Pointers are used in the calls, rather than being part of the type definitions, in order to emphasize that the values passed through are pointers to the real data in Fabric, which in this case is data in GPU memory.