Gecko [ www.corporacionhdm.com.pe ]

Name	Size	Permission	Date
cuda_wrappers	[ DIR ]	drwxr-xr-x	2025-06-02 12:56
llvm_libc_wrappers	[ DIR ]	drwxr-xr-x	2025-06-02 12:56
openmp_wrappers	[ DIR ]	drwxr-xr-x	2025-06-02 12:56
ppc_wrappers	[ DIR ]	drwxr-xr-x	2025-06-02 12:56
__clang_cuda_builtin_vars.h	4.78 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_cmath.h	18.06 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_complex_builtins.h	9.36 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_device_functions.h	56.68 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_intrinsics.h	29.93 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_libdevice_declares.h	21.87 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_math.h	15.99 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_math_forward_declares.h	8.27 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_runtime_wrapper.h	17.61 KB	-rw-r--r--	2023-11-28 08:52
__clang_cuda_texture_intrinsics.h	31.86 KB	-rw-r--r--	2023-11-28 08:52
__clang_hip_cmath.h	26.34 KB	-rw-r--r--	2023-11-28 08:52
__clang_hip_libdevice_declares.h	19.87 KB	-rw-r--r--	2023-11-28 08:52
__clang_hip_math.h	31.96 KB	-rw-r--r--	2023-11-28 08:52
__clang_hip_runtime_wrapper.h	4.65 KB	-rw-r--r--	2023-11-28 08:52
__clang_hip_stdlib.h	1.19 KB	-rw-r--r--	2023-11-28 08:52
__stddef_max_align_t.h	857 B	-rw-r--r--	2023-11-28 08:52
__wmmintrin_aes.h	5.15 KB	-rw-r--r--	2023-11-28 08:52
__wmmintrin_pclmul.h	1.99 KB	-rw-r--r--	2023-11-28 08:52
adxintrin.h	7.37 KB	-rw-r--r--	2023-11-28 08:52
altivec.h	697.32 KB	-rw-r--r--	2023-11-28 08:52
ammintrin.h	7.54 KB	-rw-r--r--	2023-11-28 08:52
amxcomplexintrin.h	6.81 KB	-rw-r--r--	2023-11-28 08:52
amxfp16intrin.h	1.82 KB	-rw-r--r--	2023-11-28 08:52
amxintrin.h	21.12 KB	-rw-r--r--	2023-11-28 08:52
arm64intr.h	993 B	-rw-r--r--	2023-11-28 08:52
arm_acle.h	25.66 KB	-rw-r--r--	2023-11-28 08:52
arm_bf16.h	548 B	-rw-r--r--	2024-11-06 08:03
arm_cde.h	32.67 KB	-rw-r--r--	2024-11-06 08:03
arm_cmse.h	6.21 KB	-rw-r--r--	2023-11-28 08:52
arm_fp16.h	16.92 KB	-rw-r--r--	2024-11-06 08:03
arm_mve.h	1.48 MB	-rw-r--r--	2024-11-06 08:03
arm_neon.h	2.45 MB	-rw-r--r--	2024-11-06 08:03
arm_neon_sve_bridge.h	9.48 KB	-rw-r--r--	2023-11-28 08:52
arm_sme_draft_spec_subject_to_change.h	60.2 KB	-rw-r--r--	2024-11-06 08:03
arm_sve.h	1.51 MB	-rw-r--r--	2024-11-06 08:03
armintr.h	843 B	-rw-r--r--	2023-11-28 08:52
avx2intrin.h	186.96 KB	-rw-r--r--	2023-11-28 08:52
avx512bf16intrin.h	10.51 KB	-rw-r--r--	2023-11-28 08:52
avx512bitalgintrin.h	2.41 KB	-rw-r--r--	2023-11-28 08:52
avx512bwintrin.h	75.33 KB	-rw-r--r--	2023-11-28 08:52
avx512cdintrin.h	4.12 KB	-rw-r--r--	2023-11-28 08:52
avx512dqintrin.h	58.75 KB	-rw-r--r--	2023-11-28 08:52
avx512erintrin.h	11.83 KB	-rw-r--r--	2023-11-28 08:52
avx512fintrin.h	382.64 KB	-rw-r--r--	2023-11-28 08:52
avx512fp16intrin.h	156.63 KB	-rw-r--r--	2023-11-28 08:52
avx512ifmaintrin.h	2.49 KB	-rw-r--r--	2023-11-28 08:52
avx512ifmavlintrin.h	4.31 KB	-rw-r--r--	2023-11-28 08:52
avx512pfintrin.h	4.53 KB	-rw-r--r--	2023-11-28 08:52
avx512vbmi2intrin.h	13.17 KB	-rw-r--r--	2023-11-28 08:52
avx512vbmiintrin.h	3.72 KB	-rw-r--r--	2023-11-28 08:52
avx512vbmivlintrin.h	6.94 KB	-rw-r--r--	2023-11-28 08:52
avx512vlbf16intrin.h	19.21 KB	-rw-r--r--	2023-11-28 08:52
avx512vlbitalgintrin.h	4.23 KB	-rw-r--r--	2023-11-28 08:52
avx512vlbwintrin.h	121.26 KB	-rw-r--r--	2023-11-28 08:52
avx512vlcdintrin.h	7.66 KB	-rw-r--r--	2023-11-28 08:52
avx512vldqintrin.h	46.41 KB	-rw-r--r--	2023-11-28 08:52
avx512vlfp16intrin.h	85.51 KB	-rw-r--r--	2023-11-28 08:52
avx512vlintrin.h	322.29 KB	-rw-r--r--	2023-11-28 08:52
avx512vlvbmi2intrin.h	25.72 KB	-rw-r--r--	2023-11-28 08:52
avx512vlvnniintrin.h	13.13 KB	-rw-r--r--	2023-11-28 08:52
avx512vlvp2intersectintrin.h	4.44 KB	-rw-r--r--	2023-11-28 08:52
avx512vnniintrin.h	4.21 KB	-rw-r--r--	2023-11-28 08:52
avx512vp2intersectintrin.h	2.9 KB	-rw-r--r--	2023-11-28 08:52
avx512vpopcntdqintrin.h	2 KB	-rw-r--r--	2023-11-28 08:52
avx512vpopcntdqvlintrin.h	3.31 KB	-rw-r--r--	2023-11-28 08:52
avxifmaintrin.h	5.75 KB	-rw-r--r--	2023-11-28 08:52
avxintrin.h	195.41 KB	-rw-r--r--	2023-11-28 08:52
avxneconvertintrin.h	14.09 KB	-rw-r--r--	2023-11-28 08:52
avxvnniint16intrin.h	17.41 KB	-rw-r--r--	2023-11-28 08:52
avxvnniint8intrin.h	18.67 KB	-rw-r--r--	2023-11-28 08:52
avxvnniintrin.h	10.44 KB	-rw-r--r--	2023-11-28 08:52
bmi2intrin.h	7.09 KB	-rw-r--r--	2023-11-28 08:52
bmiintrin.h	14.12 KB	-rw-r--r--	2023-11-28 08:52
builtins.h	741 B	-rw-r--r--	2023-11-28 08:52
cet.h	1.49 KB	-rw-r--r--	2023-11-28 08:52
cetintrin.h	3.27 KB	-rw-r--r--	2023-11-28 08:52
cldemoteintrin.h	1.18 KB	-rw-r--r--	2023-11-28 08:52
clflushoptintrin.h	1.17 KB	-rw-r--r--	2023-11-28 08:52
clwbintrin.h	1.2 KB	-rw-r--r--	2023-11-28 08:52
clzerointrin.h	1.19 KB	-rw-r--r--	2023-11-28 08:52
cmpccxaddintrin.h	2.33 KB	-rw-r--r--	2023-11-28 08:52
cpuid.h	11.01 KB	-rw-r--r--	2023-11-28 08:52
crc32intrin.h	3.27 KB	-rw-r--r--	2023-11-28 08:52
emmintrin.h	192.64 KB	-rw-r--r--	2023-11-28 08:52
enqcmdintrin.h	2.12 KB	-rw-r--r--	2023-11-28 08:52
f16cintrin.h	5.39 KB	-rw-r--r--	2023-11-28 08:52
float.h	5.63 KB	-rw-r--r--	2023-11-28 08:52
fma4intrin.h	6.82 KB	-rw-r--r--	2023-11-28 08:52
fmaintrin.h	28.4 KB	-rw-r--r--	2023-11-28 08:52
fxsrintrin.h	2.82 KB	-rw-r--r--	2023-11-28 08:52
gfniintrin.h	7.57 KB	-rw-r--r--	2023-11-28 08:52
hexagon_circ_brev_intrinsics.h	15.59 KB	-rw-r--r--	2023-11-28 08:52
hexagon_protos.h	374.42 KB	-rw-r--r--	2023-11-28 08:52
hexagon_types.h	130.33 KB	-rw-r--r--	2023-11-28 08:52
hresetintrin.h	1.36 KB	-rw-r--r--	2023-11-28 08:52
htmintrin.h	6.14 KB	-rw-r--r--	2023-11-28 08:52
htmxlintrin.h	9.01 KB	-rw-r--r--	2023-11-28 08:52
hvx_hexagon_protos.h	254.26 KB	-rw-r--r--	2023-11-28 08:52
ia32intrin.h	12.72 KB	-rw-r--r--	2023-11-28 08:52
immintrin.h	23.57 KB	-rw-r--r--	2023-11-28 08:52
intrin.h	28.22 KB	-rw-r--r--	2023-11-28 08:52
inttypes.h	2.26 KB	-rw-r--r--	2023-11-28 08:52
invpcidintrin.h	764 B	-rw-r--r--	2023-11-28 08:52
iso646.h	656 B	-rw-r--r--	2023-11-28 08:52
keylockerintrin.h	17.98 KB	-rw-r--r--	2023-11-28 08:52
larchintrin.h	7.8 KB	-rw-r--r--	2023-11-28 08:52
limits.h	3.61 KB	-rw-r--r--	2023-11-28 08:52
lwpintrin.h	5 KB	-rw-r--r--	2023-11-28 08:52
lzcntintrin.h	3.18 KB	-rw-r--r--	2023-11-28 08:52
mm3dnow.h	4.5 KB	-rw-r--r--	2023-11-28 08:52
mm_malloc.h	1.88 KB	-rw-r--r--	2023-11-28 08:52
mmintrin.h	55.98 KB	-rw-r--r--	2023-11-28 08:52
module.modulemap	3.33 KB	-rw-r--r--	2023-11-28 08:52
movdirintrin.h	1.57 KB	-rw-r--r--	2023-11-28 08:52
msa.h	25.01 KB	-rw-r--r--	2023-11-28 08:52
mwaitxintrin.h	2.19 KB	-rw-r--r--	2023-11-28 08:52
nmmintrin.h	709 B	-rw-r--r--	2023-11-28 08:52
opencl-c-base.h	30.38 KB	-rw-r--r--	2023-11-28 08:52
opencl-c.h	874.39 KB	-rw-r--r--	2023-11-28 08:52
pconfigintrin.h	1.19 KB	-rw-r--r--	2023-11-28 08:52
pkuintrin.h	934 B	-rw-r--r--	2023-11-28 08:52
pmmintrin.h	10.5 KB	-rw-r--r--	2023-11-28 08:52
popcntintrin.h	1.82 KB	-rw-r--r--	2023-11-28 08:52
prfchiintrin.h	2.02 KB	-rw-r--r--	2023-11-28 08:52
prfchwintrin.h	2.06 KB	-rw-r--r--	2023-11-28 08:52
ptwriteintrin.h	1.05 KB	-rw-r--r--	2023-11-28 08:52
raointintrin.h	6.59 KB	-rw-r--r--	2023-11-28 08:52
rdpruintrin.h	1.59 KB	-rw-r--r--	2023-11-28 08:52
rdseedintrin.h	2.85 KB	-rw-r--r--	2023-11-28 08:52
riscv_ntlh.h	855 B	-rw-r--r--	2023-11-28 08:52
rtmintrin.h	1.25 KB	-rw-r--r--	2023-11-28 08:52
s390intrin.h	604 B	-rw-r--r--	2023-11-28 08:52
serializeintrin.h	881 B	-rw-r--r--	2023-11-28 08:52
sgxintrin.h	1.77 KB	-rw-r--r--	2023-11-28 08:52
sha512intrin.h	5.95 KB	-rw-r--r--	2023-11-28 08:52
shaintrin.h	7.37 KB	-rw-r--r--	2023-11-28 08:52
sifive_vector.h	522 B	-rw-r--r--	2023-11-28 08:52
sm3intrin.h	7.29 KB	-rw-r--r--	2023-11-28 08:52
sm4intrin.h	8.2 KB	-rw-r--r--	2023-11-28 08:52
smmintrin.h	99.32 KB	-rw-r--r--	2023-11-28 08:52
stdalign.h	911 B	-rw-r--r--	2023-11-28 08:52
stdarg.h	1.66 KB	-rw-r--r--	2023-11-28 08:52
stdatomic.h	8.3 KB	-rw-r--r--	2023-11-28 08:52
stdbool.h	1.04 KB	-rw-r--r--	2023-11-28 08:52
stddef.h	4.16 KB	-rw-r--r--	2023-11-28 08:52
stdint.h	32.49 KB	-rw-r--r--	2023-11-28 08:52
stdnoreturn.h	1.17 KB	-rw-r--r--	2023-11-28 08:52
tbmintrin.h	3.15 KB	-rw-r--r--	2023-11-28 08:52
tgmath.h	29.68 KB	-rw-r--r--	2023-11-28 08:52
tmmintrin.h	29.51 KB	-rw-r--r--	2023-11-28 08:52
tsxldtrkintrin.h	1.97 KB	-rw-r--r--	2023-11-28 08:52
uintrintrin.h	4.96 KB	-rw-r--r--	2023-11-28 08:52
unwind.h	11.21 KB	-rw-r--r--	2023-11-28 08:52
vadefs.h	1.39 KB	-rw-r--r--	2023-11-28 08:52
vaesintrin.h	2.46 KB	-rw-r--r--	2023-11-28 08:52
varargs.h	477 B	-rw-r--r--	2023-11-28 08:52
vecintrin.h	360.82 KB	-rw-r--r--	2023-11-28 08:52
velintrin.h	2.1 KB	-rw-r--r--	2023-11-28 08:52
velintrin_approx.h	3.54 KB	-rw-r--r--	2023-11-28 08:52
velintrin_gen.h	69.06 KB	-rw-r--r--	2023-11-28 08:52
vpclmulqdqintrin.h	1.06 KB	-rw-r--r--	2023-11-28 08:52
waitpkgintrin.h	1.33 KB	-rw-r--r--	2023-11-28 08:52
wasm_simd128.h	76.25 KB	-rw-r--r--	2023-11-28 08:52
wbnoinvdintrin.h	749 B	-rw-r--r--	2023-11-28 08:52
wmmintrin.h	659 B	-rw-r--r--	2023-11-28 08:52
x86gprintrin.h	2.32 KB	-rw-r--r--	2023-11-28 08:52
x86intrin.h	1.81 KB	-rw-r--r--	2023-11-28 08:52
xmmintrin.h	106.73 KB	-rw-r--r--	2023-11-28 08:52
xopintrin.h	19.96 KB	-rw-r--r--	2023-11-28 08:52
xsavecintrin.h	2.51 KB	-rw-r--r--	2023-11-28 08:52
xsaveintrin.h	1.64 KB	-rw-r--r--	2023-11-28 08:52
xsaveoptintrin.h	1 KB	-rw-r--r--	2023-11-28 08:52
xsavesintrin.h	1.24 KB	-rw-r--r--	2023-11-28 08:52
xtestintrin.h	873 B	-rw-r--r--	2023-11-28 08:52

Rename

/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file.
 * Every variant forces inlining and suppresses debug info; they differ only
 * in the target feature enabled for the annotated function (amx-tile,
 * amx-int8, amx-bf16 or amx-fp16). All four are #undef'd again at the end of
 * this header. */
#define __DEFAULT_FN_ATTRS_TILE                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))

/// Load the tile configuration from the 64-byte memory location pointed to
/// by "__config". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configuration will result in a #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit (64-byte) configuration data. It is only read.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
  __builtin_ia32_tile_loadconfig(__config);
}

/// Store the current tile configuration to the 64-byte memory location
/// pointed to by "__config". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit (64-byte) buffer that receives the
///    configuration.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
  __builtin_ia32_tile_storeconfig(__config);
}

/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds. Takes no arguments and returns
/// nothing.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  __builtin_ia32_tilerelease();
}

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// The macro casts "base" to <c>const void *</c> and "stride" to
/// <c>__SIZE_TYPE__</c> before handing them to the builtin.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// The macro casts "base" to <c>const void *</c> and "stride" to
/// <c>__SIZE_TYPE__</c> before handing them to the builtin.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "dst" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// The macro casts "base" to <c>void *</c> and "stride" to
/// <c>__SIZE_TYPE__</c> before handing them to the builtin.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    The tile whose contents are written to memory (despite its name, this is
///    the source of the store). Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))

/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator (signed x signed variant).
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with
/// corresponding signed 8-bit integers in "src1", producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))

/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator (signed x unsigned variant).
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with
/// corresponding unsigned 8-bit integers in "src1", producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator (unsigned x signed variant).
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0"
/// with corresponding signed 8-bit integers in "src1", producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Compute the dot-product of bytes in tiles with a source/destination
/// accumulator (unsigned x unsigned variant).
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0"
/// with corresponding unsigned 8-bit integers in "src1", producing 4
/// intermediate 32-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Compute the dot-product of BF16 (16-bit) floating-point pairs in tiles
/// "src0" and "src1", accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in "dst", and store the 32-bit result
/// back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination (accumulator) tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))

/// The AMX tile register size is configurable; the maximum size is 16x64=1024
/// bytes. Since there is no 2D type in LLVM IR, a vector type is used to
/// represent a 2D tile, and its fixed size is the maximum AMX tile register
/// size: a 1024-byte vector of int with 64-byte alignment.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));

/// Internal helper behind "__tile_loadd"; C/C++ users should avoid calling it
/// directly. Forwards the shape, base address and stride to the TILELOADD
/// internal builtin and returns the resulting tile value.
///
/// The helper is tagged with the amx-tile attribute set (not amx-int8): its
/// caller "__tile_loadd" is declared with amx-tile only, so this always-inline
/// helper must not demand a target feature that its caller does not enable.
///
/// \param m
///    First shape value; "__tile_loadd" passes the tile's "row" field.
/// \param n
///    Second shape value; "__tile_loadd" passes the tile's "col" field.
/// \param base
///    A pointer to base address. It is only read.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
/// \returns The tile value produced by the builtin.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                     __SIZE_TYPE__ stride) {
  /* "stride" already has type __SIZE_TYPE__, so no cast is needed. */
  return __builtin_ia32_tileloadd64_internal(m, n, base, stride);
}

/// Internal helper behind "__tile_stream_loadd"; C/C++ users should avoid
/// calling it directly. Forwards the shape, base address and stride to the
/// TILELOADDT1 internal builtin and returns the resulting tile value.
///
/// The helper is tagged with the amx-tile attribute set (not amx-int8): its
/// caller "__tile_stream_loadd" is declared with amx-tile only, so this
/// always-inline helper must not demand a target feature that its caller does
/// not enable.
///
/// \param m
///    First shape value; "__tile_stream_loadd" passes the tile's "row" field.
/// \param n
///    Second shape value; "__tile_stream_loadd" passes the tile's "col" field.
/// \param base
///    A pointer to base address. It is only read.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
/// \returns The tile value produced by the builtin.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  /* "stride" already has type __SIZE_TYPE__, so no cast is needed. */
  return __builtin_ia32_tileloaddt164_internal(m, n, base, stride);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpbssd" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPBSSD internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpbsud" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPBSUD internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpbusd" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPBUSD internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpbuud" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPBUUD internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}

/// Internal helper behind "__tile_stored"; C/C++ users should avoid calling it
/// directly. Forwards the shape, base address, stride and tile value to the
/// TILESTORED internal builtin.
///
/// The helper is tagged with the amx-tile attribute set (not amx-int8): its
/// caller "__tile_stored" is declared with amx-tile only, so this
/// always-inline helper must not demand a target feature that its caller does
/// not enable.
///
/// \param m
///    First shape value; "__tile_stored" passes the tile's "row" field.
/// \param n
///    Second shape value; "__tile_stored" passes the tile's "col" field.
/// \param base
///    A pointer to base address of the destination memory.
/// \param stride
///    The stride between the rows' data to be stored in memory.
/// \param tile
///    The tile value to store.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
                      __SIZE_TYPE__ stride, _tile1024i tile) {
  /* Plain call: a void function must not "return" an expression in C, and
   * "stride" already has type __SIZE_TYPE__, so no cast is needed. */
  __builtin_ia32_tilestored64_internal(m, n, base, stride, tile);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpbf16ps" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPBF16PS internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly; "__tile_dpfp16ps" is the wrapper that uses it.
///
/// Forwards the three shape values and the three tile values unchanged to the
/// TDPFP16PS internal builtin and returns the tile it produces. The wrapper
/// passes src0.row, src1.col and src0.col as "m", "n" and "k", the current
/// accumulator as "dst", and its two source tiles as "src1" and "src2".
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// This struct packs the shape and the tile data together for the user. We
/// suggest initializing the struct as early as possible, because the compiler
/// depends on the shape information to do the tile configuration. Constant
/// values are preferred, as they allow better optimization by the compiler.
typedef struct __tile1024i_str {
  /* Shape fields are const: they are fixed when the struct is initialized. */
  const unsigned short row;
  const unsigned short col;
  /* Tile contents, held in the fixed-size 1024-byte vector type. */
  _tile1024i tile;
} __tile1024i;

/// Load tile rows from memory specifieid by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
                                    __SIZE_TYPE__ stride) {
  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}

/// Load tile rows from memory specifieid by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
                                           __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Store the tile specified by "src" to memory specifieid by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
                                     __tile1024i src) {
  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
}

/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
///    The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */