Tutorial 27 - GPU Data Node with CPU Array Pointers
The GPU data node illustrates an alternative method of extracting array data from the GPU: returning a CPU pointer to the GPU array. Normally the data access returns a GPU pointer to an array of GPU pointers, optimized for future use in parallel processing of GPU array data. By returning a CPU pointer to the array instead, you can dereference the pointers with host-side processing.
OgnTutorialCudaDataCpu.ogn
The ogn file shows the implementation of a node named “omni.graph.tutorials.CudaCpuArrays”, which has an input and an output of type float[3][] (plus a float[3] multiplier input), along with the special cudaPointers keyword to indicate that the pointers to the CUDA arrays should be in CPU space.
1{
2 "CudaCpuArrays" : {
3 "version": 1,
4 "memoryType": "cuda",
5 "cudaPointers": "cpu",
6 "description": [
7 "This is a tutorial node. It illustrates the alternative method of extracting pointers to GPU array data",
8 "in which the pointer returned is a CPU pointer and can be dereferenced on the CPU side. Without the",
9 "cudaPointers value set that pointer would be a GPU pointer to an array of GPU pointers and could",
10 "only be dereferenced on the device."
11 ],
12 "metadata":
13 {
14 "uiName": "Tutorial Node: Attributes With CUDA Array Pointers In Cpu Memory"
15 },
16 "categories": "tutorials",
17 "inputs": {
18 "points": {
19 "type": "float[3][]",
20 "memoryType": "any",
21 "description": ["Array of points to be moved"],
22 "default": []
23 },
24 "multiplier": {
25 "type": "float[3]",
26 "description": ["Amplitude of the expansion for the input points"],
27 "default": [1.0, 1.0, 1.0]
28 }
29 },
30 "outputs": {
31 "points": {
32 "type": "float[3][]",
33 "description": ["Final positions of points"]
34 }
35 },
36 "tests": [
37 {
38 "inputs:multiplier": [1.0, 2.0, 3.0],
39 "inputs:points": [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]],
40 "outputs:points": [[1.0, 2.0, 3.0], [2.0, 4.0, 6.0], [3.0, 6.0, 9.0]]
41 }
42 ]
43 }
44}
OgnTutorialCudaDataCpu.cpp
The cpp file contains the implementation of the compute method, which in turn calls the CUDA algorithm.
1// SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3//
4// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5// property and proprietary rights in and to this material, related
6// documentation and any modifications thereto. Any use, reproduction,
7// disclosure or distribution of this material and related documentation
8// without an express license agreement from NVIDIA CORPORATION or
9// its affiliates is strictly prohibited.
10#include <omni/graph/core/GpuArray.h>
11
12#include <OgnTutorialCudaDataCpuDatabase.h>
13
14// This function exercises referencing of array data on the GPU.
15// The CUDA code takes its own "float3" data types, which are castable equivalents to the generated GfVec3f.
16// The GpuArray/ConstGpuArray wrappers isolate the GPU pointers from the CPU code.
17extern "C" void applyDeformationCpuToGpu(pxr::GfVec3f* outputPoints,
18 const pxr::GfVec3f* inputPoints,
19 const pxr::GfVec3f* multiplier,
20 size_t numberOfPoints);
21
22// This node runs a couple of algorithms on the GPU, while accessing parameters from the CPU
23class OgnTutorialCudaDataCpu
24{
25public:
26 static bool compute(OgnTutorialCudaDataCpuDatabase& db)
27 {
28 size_t numberOfPoints = db.inputs.points.size();
29 db.outputs.points.resize(numberOfPoints); // size the output array to match the input point count
30 if (numberOfPoints > 0)
31 {
32 // The main point to note here is how the pointer can be dereferenced on the CPU side, whereas normally
33 // you would have to send it to the GPU for dereferencing. (The long term purpose of the latter is to make
34 // it more efficient to handle arrays-of-arrays on the GPU, however since that is not yet implemented
35 // we can get away with a single dereference here.)
36 applyDeformationCpuToGpu(
37 *db.outputs.points(), *db.inputs.points.gpu(), db.inputs.multiplier(), numberOfPoints);
38
39 // Just as a test now also reference the points as CPU data to ensure the value casts correctly
40 const float* pointsAsCpu = reinterpret_cast<const float*>(db.inputs.points.cpu().data());
41 if (!pointsAsCpu) // a null pointer here means the GPU->CPU copy failed (see the warning below)
42 {
43 db.logWarning("Points could not be copied to the CPU");
44 return false;
45 }
46 }
47
48 return true;
49 }
50};
51
52REGISTER_OGN_NODE()
OgnTutorialCudaDataCpu_CUDA.cu
The cu file contains the implementation of the algorithm on the GPU using CUDA.
1// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
2//
3// NVIDIA CORPORATION and its licensors retain all intellectual property
4// and proprietary rights in and to this software, related documentation
5// and any modifications thereto. Any use, reproduction, disclosure or
6// distribution of this software and related documentation without an express
7// license agreement from NVIDIA CORPORATION is strictly prohibited.
8//
9#include <OgnTutorialCudaDataCpuDatabase.h>
10
11// ======================================================================
12// Apply the multiplier to "inputPoints" to yield "outputPoints"
13__global__ void applyDeformationCUDA(
14 float3* outputPoints,
15 const float3* inputPoints,
16 const float3* multiplier,
17 size_t numberOfPoints
18)
19{
20 // Make sure the current evaluation block is in range of the available points
21 int currentPointIndex = blockIdx.x * blockDim.x + threadIdx.x; // one thread handles one point
22 if (numberOfPoints <= currentPointIndex) return; // threads beyond the last point do nothing
23
24 // Apply the multiplier to the current points
25 outputPoints[currentPointIndex].x = inputPoints[currentPointIndex].x * multiplier->x;
26 outputPoints[currentPointIndex].y = inputPoints[currentPointIndex].y * multiplier->y;
27 outputPoints[currentPointIndex].z = inputPoints[currentPointIndex].z * multiplier->z;
28}
29// CPU interface called from OgnTutorialCudaDataCpu::compute, used to launch the GPU workers.
30// "numberOfPoints" is technically redundant since it's equal to inputPoints.size(), however that call
31// is only available on the __device__ (GPU) side and the value is needed in the calculation of the
32// number of blocks required here on the __host__ (CPU) side so it's better to pass it in.
extern "C" void applyDeformationCpuToGpu(
34 float3* outputPoints,
35 const float3* inputPoints,
36 const float3* multiplier,
37 size_t numberOfPoints
38)
39{
40 // Use 256 threads per block, an arbitrary number that could be more precisely tuned when necessary
41 const int numberOfThreads = 256;
42 // The number of blocks is how many thread-sized groups are needed to cover every point, rounded up
43 const int numberOfBlocks = (numberOfPoints + numberOfThreads - 1) / numberOfThreads;

45 // Launch the GPU deformation using the calculated number of threads and blocks
46 applyDeformationCUDA<<<numberOfBlocks, numberOfThreads>>>(outputPoints, inputPoints, multiplier, numberOfPoints);
47}
OgnTutorialCudaDataCpuPy.py
The py file contains the implementation of the compute method, which for this example doesn’t do any actual computation, as extra extension support (e.g. a Python-to-CUDA compiler) would be required for Python code to run on the GPU.
1"""
2Implementation of the Python node accessing CUDA attributes in a way that accesses the GPU arrays with a CPU pointer.
3
4No actual computation is done here as the tutorial nodes are not set up to handle GPU computation.
5"""
6
7import ctypes
8
9import omni.graph.core as og
10
11# Only one type of data is handled by the compute - pointf[3][] (an array of float[3] tuples with the POSITION role)
12POINT_ARRAY_TYPE = og.Type(og.BaseDataType.FLOAT, tuple_count=3, array_depth=1, role=og.AttributeRole.POSITION)  # NOTE(review): not referenced by compute() below — confirm external users before removing
13
14
15def get_address(attr: og.Attribute) -> int:
16 """Return the pointer-sized integer stored at the attribute's memory location (the array's address), or 0 if the attribute has no memory set."""
17 if attr.memory == 0:  # no memory allocated for the attribute yet
18 return 0
19 ptr_type = ctypes.POINTER(ctypes.c_size_t)  # treat the address as a pointer to a pointer-sized integer
20 ptr = ctypes.cast(attr.memory, ptr_type)
21 return ptr.contents.value  # single dereference, done on the CPU side
22
23
24class OgnTutorialCudaDataCpuPy:
25 """Exercise GPU access for extended attributes through a Python OmniGraph node"""
26
27 @staticmethod
28 def compute(db) -> bool:
29 """Accesses the CUDA data, which for arrays exists as wrappers around CPU memory pointers to GPU pointer arrays.
30 No compute is done here.
31 """
32
33 # Put accessors into local variables for convenience
34 input_points = db.inputs.points
35 multiplier = db.inputs.multiplier
36
37 # Set the size to what is required for the multiplication - this can be done without accessing GPU data.
38 # Notice that since this is CPU pointers to GPU data the size has to be taken from the data type description
39 # rather than the usual method of taking len(input_points).
40 db.outputs.points_size = input_points.dtype.size
41
42 # After changing the size the memory isn't allocated immediately (when necessary). It is delayed until you
43 # request access to it, which is what this line will do.
44 output_points = db.outputs.points
45
46 # This is a separate test to add a points attribute to the output bundle to show how when a bundle has
47 # CPU pointers to the GPU data that information propagates to its children
48 # Start with an empty output bundle.
49 output_bundle = db.outputs.outBundle
50 output_bundle.clear()
51 output_bundle.add_attributes([og.Type(og.BaseDataType.FLOAT, 3, 1)], ["points"])  # float[3][] member named "points"
52 bundle_attr = output_bundle.attribute_by_name("points")
53 # As for the main attributes, setting the bundle member size readies the buffer of the given size on the GPU
54 bundle_attr.size = input_points.dtype.size
55
56 # The output cannot be written to here through the normal assignment mechanisms, e.g. the typical step of
57 # copying input points to the output points, as the data is not accessible on the GPU through Python directly.
58 # Instead you can access the GPU memory pointers through the attribute values and send it to CUDA code, either
59 # generated from the Python code or accessed through something like pybind wrappers.
60
61 print("Locations in CUDA() should be in GPU memory space")
62 print(f" CPU Location for reference = {hex(id(db))}", flush=True)  # id() is a host-memory address in CPython, shown for contrast
63 print(f" Input points are {input_points} at CUDA({hex(get_address(input_points))})", flush=True)
64 print(f" Multiplier is CUDA({multiplier})", flush=True)
65 print(f" Output points are {output_points} at CUDA({hex(get_address(output_points))})", flush=True)
66 print(f" Bundle {bundle_attr.gpu_value} at CUDA({hex(get_address(bundle_attr.gpu_value))})", flush=True)
67
68 return True