Tutorial 30 - Node with more advanced computeVectorized
This tutorial demonstrates how to compose nodes that implements a computeVectorized function. It shows how to access the raw vectorized data, and how it can be used to write a performant tight loop using SIMD instructions.
OgnTutorialSIMDAdd.ogn
The ogn file shows the implementation of a node named “omni.graph.tutorials.TutorialSIMDFloatAdd”, which takes inputs of 2 floating point values, and performs a sum.
1{
2 "TutorialSIMDFloatAdd": {
3 "version": 1,
4 "description": "Add 2 floats together using SIMD instruction set",
5 "categories": "tutorials",
6 "uiName": "Tutorial Node: SIMD Add",
7 "inputs": {
8 "a": {
9 "type": "float",
10 "description": "first input operand"
11 },
12 "b": {
13 "type": "float",
14 "description": "second input operand"
15 }
16 },
17 "outputs": {
18 "result": {
19 "type": "float",
20 "description": "the sum of a and b"
21 }
22 }
23 }
24}
OgnTutorialSIMDAdd.cpp
The cpp file contains the implementation of the node. It takes two floating point inputs and performs a sum, demonstrating how to handle a vectorized compute. It shows how to retrieve the vectorized array of inputs and output, how to reason about the number of instances provided, and how to optimize the compute taking advantage of those vectorized inputs. Since a SIMD instruction requires a given alignment for its arguments, the compute is divided in 3 sections: - a first section that does a regular sum input on the few first instances that don’t have a proper alignment - a second, the heart of the function, that does as much SIMD adds as it can, performing them 4 elements by 4 elements - a last section that perform regular sum on the few remaining items that did not fit in the SIMD register
1// SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3//
4// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5// property and proprietary rights in and to this material, related
6// documentation and any modifications thereto. Any use, reproduction,
7// disclosure or distribution of this material and related documentation
8// without an express license agreement from NVIDIA CORPORATION or
9// its affiliates is strictly prohibited.
10
11#if !defined(__arm__) && !defined(__aarch64__)
12# define SIMD_AVAILABLE
13#endif
14
15#include <OgnTutorialSIMDAddDatabase.h>
16
17#ifdef SIMD_AVAILABLE
18# include <immintrin.h>
19#endif
20
21// This node perform a sum using SIMD instruction set
22class OgnTutorialSIMDAdd
23{
24public:
25 static size_t computeVectorized(OgnTutorialSIMDAddDatabase& db, size_t count)
26 {
27 // Retrieve data
28 auto opA = db.inputs.a.vectorized(count);
29 auto opB = db.inputs.b.vectorized(count);
30 auto res = db.outputs.result.vectorized(count);
31
32 // Regular loop definition
33 auto regularLoop = [&](size_t const begin, size_t const end) -> size_t const
34 {
35 for (size_t idx = begin; idx < end; ++idx)
36 res[idx] = opA[idx] + opB[idx];
37 return end;
38 };
39
40#ifdef SIMD_AVAILABLE
41
42 // Constants
43 static size_t constexpr kSIMDSize = sizeof(__m128);
44 static size_t constexpr kMask = kSIMDSize - 1;
45 static size_t constexpr kSIMDFloatCount = kSIMDSize / sizeof(float);
46
47 // Alignment must be identical
48 bool const correctlyAligned = ((size_t(opA.data()) & kMask) == (size_t(opB.data()) & kMask)) &&
49 ((size_t(opA.data()) & kMask) == (size_t(res.data()) & kMask));
50
51 if (!correctlyAligned)
52 {
53 regularLoop(0, count);
54 }
55 else
56 {
57 // Unaligned elements
58 size_t const maskedAddress = (size_t(res.data()) & kMask);
59 size_t const unalignedCount = maskedAddress ? regularLoop(0, (kSIMDSize - maskedAddress) / sizeof(float)) : 0;
60
61 // Vectorized elements
62 size_t const vectorizedCount = (count - unalignedCount) & (~kMask);
63 size_t const vectorizedLoop = vectorizedCount / kSIMDFloatCount;
64
65 __m128* aSIMD = (__m128*)(opA.data() + unalignedCount);
66 __m128* bSIMD = (__m128*)(opB.data() + unalignedCount);
67 __m128* resSIMD = (__m128*)(res.data() + unalignedCount);
68 for (size_t idx = 0; idx < vectorizedLoop; ++idx)
69 resSIMD[idx] = _mm_add_ps(aSIMD[idx], bSIMD[idx]);
70
71 // Remaining elements
72 regularLoop(unalignedCount + vectorizedCount, count);
73 }
74
75#else
76
77 regularLoop(0, count);
78
79#endif
80
81 return count;
82 }
83};
84
85REGISTER_OGN_NODE()