Tutorial 30 - Node with more advanced computeVectorized

This tutorial demonstrates how to write a node that implements a computeVectorized function. It shows how to access the raw vectorized data, and how that data can be used to write a performant tight loop using SIMD instructions.

OgnTutorialSIMDAdd.ogn

The .ogn file defines a node named “omni.graph.tutorials.TutorialSIMDFloatAdd”, which takes two floating point inputs and outputs their sum.

{
    "TutorialSIMDFloatAdd": {
        "version": 1,
        "description": "Add 2 floats together using SIMD instruction set",
        "categories": "tutorials",
        "uiName": "Tutorial Node: SIMD Add",
        "inputs": {
            "a": {
                "type": "float",
                "description": "first input operand"
            },
            "b": {
                "type": "float",
                "description": "second input operand"
            }
        },
        "outputs": {
            "result": {
                "type": "float",
                "description": "the sum of a and b"
            }
        }
    }
}

OgnTutorialSIMDAdd.cpp

The cpp file contains the implementation of the node. It takes two floating point inputs and computes their sum, demonstrating how to handle a vectorized compute. It shows how to retrieve the vectorized arrays of inputs and outputs, how to reason about the number of instances provided, and how to optimize the compute by taking advantage of those vectorized inputs. Since a SIMD instruction requires a given alignment for its arguments, the compute is divided into 3 sections:

- a first section that performs a regular sum on the first few instances that don’t have the proper alignment
- a second section, the heart of the function, that performs as many SIMD adds as it can, processing 4 elements at a time
- a last section that performs a regular sum on the few remaining items that did not fill a SIMD register

// SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: LicenseRef-NvidiaProprietary
//
// NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
// property and proprietary rights in and to this material, related
// documentation and any modifications thereto. Any use, reproduction,
// disclosure or distribution of this material and related documentation
// without an express license agreement from NVIDIA CORPORATION or
// its affiliates is strictly prohibited.

// SIMD_AVAILABLE gates both the <immintrin.h> include and the SSE code path in
// computeVectorized. It is left undefined when either ARM macro (__arm__ or
// __aarch64__) is defined, in which case only the scalar loop is compiled.
#if !defined(__arm__) && !defined(__aarch64__)
#    define SIMD_AVAILABLE
#endif

#include <OgnTutorialSIMDAddDatabase.h>

#ifdef SIMD_AVAILABLE
#    include <immintrin.h>
#endif

// This node performs a per-instance float sum (result = a + b), using the SSE
// instruction set when SIMD_AVAILABLE is defined and a scalar loop otherwise.
class OgnTutorialSIMDAdd
{
public:
    // Computes result[i] = a[i] + b[i] for every i in [0, count).
    //
    // db    : generated database giving access to the vectorized inputs/outputs.
    // count : number of instances to process.
    // Returns `count`.
    static size_t computeVectorized(OgnTutorialSIMDAddDatabase& db, size_t count)
    {
        // Retrieve the vectorized views over the inputs and the output
        auto opA = db.inputs.a.vectorized(count);
        auto opB = db.inputs.b.vectorized(count);
        auto res = db.outputs.result.vectorized(count);

        // Scalar fallback: sums the half-open range [begin, end) and returns `end`
        // so that callers can chain consecutive ranges.
        auto regularLoop = [&](size_t const begin, size_t const end) -> size_t
        {
            for (size_t idx = begin; idx < end; ++idx)
                res[idx] = opA[idx] + opB[idx];
            return end;
        };

#ifdef SIMD_AVAILABLE

        // Constants
        static size_t constexpr kSIMDSize = sizeof(__m128); // register size, in bytes
        static size_t constexpr kByteMask = kSIMDSize - 1; // extracts an address' offset within a register-sized slot
        static size_t constexpr kSIMDFloatCount = kSIMDSize / sizeof(float); // floats per register
        static size_t constexpr kFloatMask = kSIMDFloatCount - 1; // rounds an element count down to whole registers

        float const* const a = opA.data();
        float const* const b = opB.data();
        float* const out = res.data();

        size_t const offsetA = reinterpret_cast<size_t>(a) & kByteMask;
        size_t const offsetB = reinterpret_cast<size_t>(b) & kByteMask;
        size_t const offsetOut = reinterpret_cast<size_t>(out) & kByteMask;

        // The three buffers must share the same misalignment, so that skipping the same
        // number of leading elements aligns all of them at once. That misalignment must
        // also be a whole number of floats, otherwise no element prefix can ever reach an
        // aligned address.
        bool const canVectorize =
            (offsetA == offsetB) && (offsetA == offsetOut) && ((offsetOut % sizeof(float)) == 0);

        if (!canVectorize)
        {
            regularLoop(0, count);
        }
        else
        {
            // Unaligned leading elements. The prefix is clamped to `count`: without the
            // clamp a small batch starting on a misaligned address would be processed past
            // its end, and the subtraction below would underflow.
            size_t const wantedPrefix = offsetOut ? (kSIMDSize - offsetOut) / sizeof(float) : 0;
            size_t const prefixEnd = regularLoop(0, wantedPrefix < count ? wantedPrefix : count);

            // Vectorized elements: as many whole registers as fit in what is left.
            // The mask operates on an element count, hence kFloatMask and not kByteMask.
            size_t const vectorizedCount = (count - prefixEnd) & ~kFloatMask;
            size_t const vectorizedEnd = prefixEnd + vectorizedCount;

            for (size_t idx = prefixEnd; idx < vectorizedEnd; idx += kSIMDFloatCount)
                _mm_store_ps(out + idx, _mm_add_ps(_mm_load_ps(a + idx), _mm_load_ps(b + idx)));

            // Remaining elements that do not fill a whole register
            regularLoop(vectorizedEnd, count);
        }

#else

        regularLoop(0, count);

#endif

        return count;
    }
};

// NOTE(review): REGISTER_OGN_NODE is not defined in this file; presumably it is
// provided through the included database header and registers the node class
// above with the graph runtime — confirm against the generated header.
REGISTER_OGN_NODE()