Spaces:

YZ-TAN
/

flask-llama

Build error

App Files Files Community

flask-llama / llama.cpp /ggml /src /ggml-sycl /common.hpp

YZ-TAN

Upload 2821 files

5a29263 verified 11 months ago

raw

history blame

24.2 kB

	//
	// MIT license
	// Copyright (C) 2024 Intel Corporation
	// SPDX-License-Identifier: MIT
	//

	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//

	#ifndef GGML_SYCL_COMMON_HPP
	#define GGML_SYCL_COMMON_HPP

	#include <fstream>
	#include <iostream>

	#include "dpct/helper.hpp"
	#include "ggml-sycl.h"
	#include "presets.hpp"
	#if GGML_SYCL_DNNL
	#include "dnnl.hpp"
	#include "dnnl_sycl.hpp"
	#endif

	#define GGML_COMMON_DECL_SYCL
	#define GGML_COMMON_IMPL_SYCL
	/* suppress warning spam */
	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wnested-anon-types"
	#include "ggml-common.h"
	#pragma clang diagnostic pop

	// Host-side allocation helpers; only declared here, defined in another
	// translation unit.
	// NOTE(review): presumably `size` is in bytes and a pointer obtained from
	// ggml_sycl_host_malloc must be released with ggml_sycl_host_free - confirm
	// against the definitions.
	void* ggml_sycl_host_malloc(size_t size);
	void ggml_sycl_host_free(void* ptr);

	// Debug-print switch. Because it is `static` at namespace scope in a header,
	// every translation unit that includes this file gets its own copy.
	static int g_ggml_sycl_debug = 0;
	// printf-style logging to stderr that is emitted only when
	// g_ggml_sycl_debug is non-zero. Wrapped in do/while(0) so the macro behaves
	// as a single statement.
	#define GGML_SYCL_DEBUG(...) \
	do { \
	if (g_ggml_sycl_debug) \
	fprintf(stderr, __VA_ARGS__); \
	} while (0)

	// Evaluates `expr` inside an immediately-invoked lambda (capturing by
	// reference) and converts the outcome into a dpct status value:
	//   - dpct::success        when `expr` completes without throwing;
	//   - dpct::default_error  when a std::exception is caught, after printing
	//                          what() plus file/line/function to std::cerr.
	// Exceptions that do not derive from std::exception are not caught here.
	#define CHECK_TRY_ERROR(expr) \
	[&]() { \
	try { \
	expr; \
	return dpct::success; \
	} catch (std::exception const& e) { \
	std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
	<< ", line:" << __LINE__ << ", func:" << __func__ \
	<< std::endl; \
	return dpct::default_error; \
	} \
	}()


	// Architecture identifier and the version thresholds it is compared against.
	// NOTE(review): the numeric values look inherited from a CUDA-style
	// compute-capability scheme - confirm before changing them.
	#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
	#define VER_4VEC 610 // TODO: tune for hardware optimization.
	#define VER_GEN9 700 // TODO: tune for hardware optimization.
	#define VER_GEN12 1000000 // TODO: tune for hardware optimization.
	#define VER_GEN13 (VER_GEN12 + 1030) // TODO: tune for hardware optimization.

	#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to the hardware

	// Flag for XMX on Intel GPUs; defined unless GGML_SYCL_FORCE_MMQ is set.
	// TODO: this flag is not actually used to drive XMX yet.
	#if !defined(GGML_SYCL_FORCE_MMQ)
	#define SYCL_USE_XMX
	#endif

	// max batch size to use MMQ kernels when tensor cores are available
	#define MMQ_MAX_BATCH_SIZE 32

	#if defined(_MSC_VER)
	#pragma warning(disable : 4244 4267) // possible loss of data
	#endif

	// dmmv = dequantize_mul_mat_vec
	// Both values may be overridden from the build system; the defaults below
	// apply only when they are not already defined.
	#ifndef GGML_SYCL_DMMV_X
	#define GGML_SYCL_DMMV_X 32
	#endif
	#ifndef GGML_SYCL_MMV_Y
	#define GGML_SYCL_MMV_Y 1
	#endif

	// Raw (non-owning as used in this header) pointer to a SYCL queue; this is
	// the "stream" handle type passed around by the backend.
	typedef sycl::queue *queue_ptr;

	// Device-usage mode of the SYCL backend. SYCL_MUL_GPU_MODE takes the value 1
	// by implicit enumeration.
	// NOTE(review): meanings (unset / single GPU / multiple GPUs) are inferred
	// from the enumerator names.
	enum ggml_sycl_backend_gpu_mode {
	SYCL_UNSET_GPU_MODE = -1,
	SYCL_SINGLE_GPU_MODE = 0,
	SYCL_MUL_GPU_MODE
	};

	// Compile-time guard: sycl::half and ggml_fp16_t must have the same size.
	static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

	// Deliberately writes through a null pointer to bring the process down.
	// This is undefined behavior in C++, so the exact failure mode depends on
	// the compiler and platform; callers in this file follow it with
	// std::exit(1) as a fallback.
	static void crash() {
	int* ptr = NULL;
	*ptr = 0;
	}

	/// Reports a failed SYCL statement on stderr and aborts via GGML_ABORT.
	///
	/// @param stmt  Text of the statement that failed.
	/// @param func  Name of the function containing the statement.
	/// @param file  Source file of the failing statement.
	/// @param line  Line number of the failing statement.
	/// @param msg   Additional description printed after the statement text.
	[[noreturn]] static void ggml_sycl_error(const char* stmt, const char* func,
	                                         const char* file, const int line,
	                                         const char* msg) {
	    // First line: what failed and why.
	    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
	    // Second line: where it failed.
	    fprintf(stderr, " in function %s at %s:%d\n", func, file, line);

	    GGML_ABORT("SYCL error");
	}

	// Evaluates `err` exactly once and, if the result is non-zero, calls
	// ggml_sycl_error with the stringified expression, the current function,
	// file and line, and a fixed message. ggml_sycl_error does not return.
	#define SYCL_CHECK(err) \
	do { \
	auto err_ = (err); \
	if (err_ != 0) \
	ggml_sycl_error( \
	#err, \
	__func__, \
	__FILE__, \
	__LINE__, \
	"Meet error in this line code!"); \
	} while (0)

	// Optimizer hint: expands to __builtin_assume(x) when
	// DPCT_COMPAT_RT_VERSION >= 11100, and to nothing otherwise (in which case
	// `x` is not evaluated at all).
	#if DPCT_COMPAT_RT_VERSION >= 11100
	#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
	#else
	#define GGML_SYCL_ASSUME(x)
	#endif // DPCT_COMPAT_RT_VERSION >= 11100

	// Scalar and 2-wide vector types used for dequantized values: half
	// precision when GGML_SYCL_F16 is defined, single precision otherwise.
	#ifdef GGML_SYCL_F16
	typedef sycl::half dfloat; // dequantize float
	typedef sycl::half2 dfloat2;
	#else
	typedef float dfloat; // dequantize float
	typedef sycl::float2 dfloat2;
	#endif // GGML_SYCL_F16

	// NOTE(review): presumably the largest batch size handled by the MMVQ
	// kernels - confirm at the use sites.
	#define MMVQ_MAX_BATCH_SIZE 8

	// 16-entry int8 lookup table.
	// NOTE(review): the name suggests the IQ4_NL quantization codebook - confirm.
	static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

	// Backend-wide state. All of these are `static` in a header, so each
	// translation unit including this file has its own independent copy.
	static int g_all_sycl_device_count = -1;
	static bool g_ggml_backend_sycl_buffer_type_initialized = false;

	static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
	SYCL_UNSET_GPU_MODE;

	// Scratch buffer bookkeeping: base pointer, total size and current offset.
	static void* g_scratch_buffer = nullptr;
	static size_t g_scratch_size = 0; // disabled by default
	static size_t g_scratch_offset = 0;

	// Writes an "unsupported GPU architecture" message to the given SYCL stream
	// and terminates the process with exit code 1. Never returns.
	[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
	stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
	"current GPU architecture.\n";
	// __trap();
	std::exit(1);

	// Never executed (it follows std::exit); it only names the function so that
	// builds which never call bad_arch do not warn about it being unused.
	(void)bad_arch; // suppress unused function warning
	}

	// Returns the id of the currently selected device. Declared here, defined
	// in another translation unit.
	int get_current_device_id();

	/// Selects `device` as the current dpct device unless it is already current.
	///
	/// @param device  Id of the device to switch to.
	/// @return 0 when `device` is already current; otherwise the status produced
	///         by CHECK_TRY_ERROR around dpct::select_device.
	///
	/// Failure to query the current device aborts through SYCL_CHECK. A
	/// sycl::exception escaping the body is logged to std::cerr, after which the
	/// process is terminated.
	inline dpct::err0 ggml_sycl_set_device(const int device) try {
	    int active_id;
	    SYCL_CHECK(CHECK_TRY_ERROR(active_id = get_current_device_id()));

	    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
	    // current_device_id=%d\n", device, active_id);
	    if (device != active_id) {
	        // Only switch when the requested device differs from the current one.
	        return CHECK_TRY_ERROR(dpct::select_device(device));
	    }

	    return 0;
	} catch (sycl::exception const& exc) {
	    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
	              << ", line:" << __LINE__ << std::endl;
	    crash();
	    std::exit(1);
	}

	//////////////////////

	// Snapshot of the SYCL devices known to the backend. All per-device arrays
	// have GGML_SYCL_MAX_DEVICES entries and are zero-initialized.
	struct ggml_sycl_device_info {
	// NOTE(review): presumably the number of valid leading entries in the
	// arrays below - confirm where the struct is filled in.
	int device_count;

	struct sycl_device_info {
	int cc; // compute capability
	// int nsm; // number of streaming multiprocessors
	// size_t smpb; // max. shared memory per block
	bool vmm; // virtual memory support
	size_t total_vram;
	};

	sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

	// NOTE(review): presumably the default fraction of a split tensor assigned
	// to each device - confirm.
	std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};

	int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
	};

	// Accessor for the backend's device information; returns a const reference.
	// Declared here, defined in another translation unit.
	const ggml_sycl_device_info & ggml_sycl_info();

	// Abstract memory-pool interface. Implementations are created elsewhere
	// and used through this interface.
	struct ggml_sycl_pool {
	// Virtual so implementations can be destroyed through a base pointer.
	virtual ~ggml_sycl_pool() = default;

	// Requests `size` bytes; the implementation reports the size it actually
	// reserved through `actual_size`.
	virtual void * alloc(size_t size, size_t * actual_size) = 0;
	// Returns a block to the pool. Within this header, `size` is always the
	// value previously reported through `actual_size`.
	virtual void free(void * ptr, size_t size) = 0;
	};

	// Scoped allocation of `T` elements from a ggml_sycl_pool.
	//
	// At most one allocation is held per object; the destructor returns it to
	// the pool it came from. The type is neither copyable nor movable, so the
	// allocation cannot be released twice.
	template<typename T>
	struct ggml_sycl_pool_alloc {
	    ggml_sycl_pool * pool = nullptr; // pool the allocation is taken from / returned to
	    T * ptr = nullptr;               // current allocation, or nullptr if none
	    size_t actual_size = 0;          // size reported by the pool for `ptr`

	    // Binds to `pool` without allocating.
	    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
	    }

	    // Binds to `pool` and immediately allocates `size` elements.
	    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
	        alloc(size);
	    }

	    // Hands the allocation (if any) back to the pool.
	    ~ggml_sycl_pool_alloc() {
	        if (ptr != nullptr) {
	            pool->free(ptr, actual_size);
	        }
	    }

	    // size is in number of elements
	    // Requires a bound pool and no existing allocation (both asserted).
	    T * alloc(size_t size) {
	        GGML_ASSERT(pool != nullptr);
	        GGML_ASSERT(ptr == nullptr);
	        // The pool works in bytes, so scale the element count by sizeof(T).
	        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
	        return ptr;
	    }

	    // Rebinds to `pool`, then allocates `size` elements from it.
	    T * alloc(ggml_sycl_pool & pool, size_t size) {
	        this->pool = &pool;
	        return alloc(size);
	    }

	    // Current allocation, or nullptr if nothing has been allocated.
	    T * get() {
	        return ptr;
	    }

	    ggml_sycl_pool_alloc() = default;
	    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
	    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
	    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
	    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
	};

	// backend interface

	// Per-tensor backend data: one device pointer per device and one event per
	// (device, stream) pair.
	struct ggml_tensor_extra_gpu {
	void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
	// tensors
	dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
	[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
	};

	// State of one SYCL backend instance: the device it is bound to, a display
	// name, and lazily created queues, oneDNN handles (optional) and memory
	// pools. None of the accessors below perform bounds checks on `device` or
	// `stream` indices.
	struct ggml_backend_sycl_context {
	    int device;
	    std::string name;

	    // Queue cache indexed by [device][stream]; entries are filled on first use.
	    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

	    // Binds the context to `device`; the name is GGML_SYCL_NAME followed by
	    // the device id.
	    explicit ggml_backend_sycl_context(int device)
	        : device(device), name(GGML_SYCL_NAME + std::to_string(device)) {}

	    // Returns the cached queue for (device, stream). On first use the slot is
	    // set to the address of the device's default queue.
	    queue_ptr stream(int device, int stream) {
	        queue_ptr & slot = qptrs[device][stream];
	        if (slot == nullptr) {
	            slot = &(dpct::get_device(device).default_queue());
	        }
	        return slot;
	    }

	    // Queue for stream 0 of this context's own device.
	    queue_ptr stream() {
	        return stream(device, 0);
	    }

	#if GGML_SYCL_DNNL
	    // Builds a oneDNN engine from the device and context of queue `q`.
	    dnnl::engine make_engine(sycl::queue* q) {
	        sycl::device queue_dev = q->get_device();
	        sycl::context queue_ctx = q->get_context();
	        return dnnl::sycl_interop::make_engine(queue_dev, queue_ctx);
	    }

	    // oneDNN streams and engines, cached per SYCL queue.
	    std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
	    std::unordered_map<sycl::queue*, dnnl::engine> engine_map;

	    // oneDNN stream for the queue at (device, _stream).
	    dnnl::stream stream_dnnl(int device, int _stream) {
	        return stream_dnnl(stream(device, _stream));
	    }

	    // Cached oneDNN engine for `qptr`; created and stored on first request.
	    dnnl::engine engine_dnnl(sycl::queue* qptr) {
	        const auto cached = engine_map.find(qptr);
	        if (cached != engine_map.end()) {
	            return cached->second;
	        }
	        auto created = make_engine(qptr);
	        engine_map[qptr] = created;
	        return created;
	    }

	    // Cached oneDNN stream for `qptr`; created and stored on first request.
	    dnnl::stream stream_dnnl(sycl::queue* qptr) {
	        const auto cached = stream_map.find(qptr);
	        if (cached != stream_map.end()) {
	            return cached->second;
	        }
	        auto eng = engine_dnnl(qptr);
	        auto created = dnnl::sycl_interop::make_stream(eng, *qptr);
	        stream_map[qptr] = created;
	        return created;
	    }

	    // oneDNN stream for stream 0 of this context's own device.
	    dnnl::stream stream_dnnl() {
	        return stream_dnnl(device, 0);
	    }
	#endif

	    // pool
	    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

	    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

	    // Pool factories; declared here, defined in another translation unit.
	    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

	    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

	    // Device pool for `device`, created on first use with that device's
	    // stream-0 queue.
	    ggml_sycl_pool & pool(int device) {
	        std::unique_ptr<ggml_sycl_pool> & slot = pools[device];
	        if (!slot) {
	            slot = new_pool_for_device(stream(device, 0), device);
	        }
	        return *slot;
	    }

	    // Device pool of this context's own device.
	    ggml_sycl_pool & pool() {
	        return pool(device);
	    }

	    // Host pool for `device`, created on first use with that device's
	    // stream-0 queue.
	    ggml_sycl_pool & host_pool(int device) {
	        std::unique_ptr<ggml_sycl_pool> & slot = host_pools[device];
	        if (!slot) {
	            slot = new_pool_for_host(stream(device, 0), device);
	        }
	        return *slot;
	    }

	    // Host pool of this context's own device.
	    ggml_sycl_pool & host_pool() {
	        return host_pool(device);
	    }
	};

	// common device functions

	// Sums `x` across the work-items of the calling sub-group using XOR
	// exchanges with masks WARP_SIZE/2, WARP_SIZE/4, ..., 1, and returns the
	// accumulated value.
	//
	// DPCT1096:98: the right-most dimension of the work-group used by the
	// calling kernel may be less than "32", in which case
	// dpct::permute_sub_group_by_xor may return an unexpected result on the CPU
	// device. Size the work-group so that its right-most dimension is a
	// multiple of "32".
	static __dpct_inline__ float warp_reduce_sum(float x,
	                                             const sycl::nd_item<3>& item_ct1) {
	    const auto sub_group = item_ct1.get_sub_group();
	#pragma unroll
	    for (int lane_mask = WARP_SIZE / 2; lane_mask > 0; lane_mask >>= 1) {
	        const float partner = dpct::permute_sub_group_by_xor(sub_group, x, lane_mask);
	        x += partner;
	    }
	    return x;
	}

	// float2 overload: sums the x and y components independently across the
	// calling sub-group using XOR exchanges with masks WARP_SIZE/2 down to 1.
	static __dpct_inline__ sycl::float2
	warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
	    const auto sub_group = item_ct1.get_sub_group();
	#pragma unroll
	    for (int lane_mask = WARP_SIZE / 2; lane_mask > 0; lane_mask >>= 1) {
	        // Component x is exchanged and accumulated first, then component y.
	        a.x() += dpct::permute_sub_group_by_xor(sub_group, a.x(), lane_mask);
	        a.y() += dpct::permute_sub_group_by_xor(sub_group, a.y(), lane_mask);
	    }
	    return a;
	}

	// Computes the maximum of `x` across the work-items of the calling
	// sub-group using XOR exchanges with masks WARP_SIZE/2 down to 1, combining
	// values with sycl::fmax.
	//
	// DPCT1096:97: the right-most dimension of the work-group used by the
	// calling kernel may be less than "32", in which case
	// dpct::permute_sub_group_by_xor may return an unexpected result on the CPU
	// device. Size the work-group so that its right-most dimension is a
	// multiple of "32".
	static __dpct_inline__ float warp_reduce_max(float x,
	                                             const sycl::nd_item<3>& item_ct1) {
	    const auto sub_group = item_ct1.get_sub_group();
	#pragma unroll
	    for (int lane_mask = WARP_SIZE / 2; lane_mask > 0; lane_mask >>= 1) {
	        const float partner = dpct::permute_sub_group_by_xor(sub_group, x, lane_mask);
	        x = sycl::fmax(x, partner);
	    }
	    return x;
	}

	// Helper for vec loading aligned data
	// Helper for vec loading aligned data.
	// Reinterprets `aligned_ptr` as a pointer to sycl::vec<Tp, n> and returns
	// the pointed-to vector by value. The caller must guarantee that the
	// pointer is suitably aligned for sycl::vec<Tp, n> and that at least `n`
	// elements are readable.
	template <typename Tp, int n>
	inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
	    return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
	}

	// Helper for accessing pointers with no warnings
	// Helper for accessing pointers with no warnings.
	// Returns the raw pointer behind a local accessor, going through the
	// undecorated multi_ptr.
	template <typename Tp, int dim>
	static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
	    auto undecorated = acc.template get_multi_ptr<sycl::access::decorated::no>();
	    return undecorated.get();
	}

	// Declared here, defined in another translation unit.
	// NOTE(review): presumably derives a reduced global range from the block
	// count and block size - confirm against the definition.
	int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

	typedef void (ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor src0,
	const ggml_tensor *src1,
	ggml_tensor dst, const float src0_dd,
	const float src1_dd, float dst_dd,
	const queue_ptr &main_stream);

	template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
	static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
	int ne0, int ne1, int ne2, int ne3,
	int ne10, int ne11, int ne12, int ne13,
	/int s0, / int s1, int s2, int s3,
	/int s10,/ int s11, int s12, int s13,
	const sycl::nd_item<3> &item_ct1) {
	const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
	item_ct1.get_local_id(2);
	const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
	item_ct1.get_local_id(1));
	const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
	item_ct1.get_local_id(0)) /
	ne3;
	const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
	item_ct1.get_local_id(0)) %
	ne3;

	if (i0s >= ne0 \|\| i1 >= ne1 \|\| i2 >= ne2 \|\| i3 >= ne3) {
	return;
	}

	const int i11 = i1 % ne11;
	const int i12 = i2 % ne12;
	const int i13 = i3 % ne13;

	const size_t i_src0 = i3s3 + i2s2 + i1*s1;
	const size_t i_src1 = i13s13 + i12s12 + i11*s11;
	const size_t i_dst = i_src0;

	const src0_t * src0_row = src0 + i_src0;
	const src1_t * src1_row = src1 + i_src1;
	dst_t * dst_row = dst + i_dst;

	for (int i0 = i0s; i0 < ne0;
	i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
	const int i10 = i0 % ne10;
	dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
	}
	}

	template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
	static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
	int ne0, int ne1, int ne2, int ne3,
	int ne10, int ne11, int ne12, int ne13,
	/int s0, / int s1, int s2, int s3,
	/int s10,/ int s11, int s12, int s13,
	const sycl::nd_item<3> &item_ct1) {

	const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
	item_ct1.get_local_id(2);

	const int i3 = i/(ne2ne1ne0);
	const int i2 = (i/(ne1*ne0)) % ne2;
	const int i1 = (i/ne0) % ne1;
	const int i0 = i % ne0;

	if (i0 >= ne0 \|\| i1 >= ne1 \|\| i2 >= ne2 \|\| i3 >= ne3) {
	return;
	}

	const int i11 = i1 % ne11;
	const int i12 = i2 % ne12;
	const int i13 = i3 % ne13;

	const size_t i_src0 = i3s3 + i2s2 + i1*s1;
	const size_t i_src1 = i13s13 + i12s12 + i11*s11;
	const size_t i_dst = i_src0;

	const src0_t * src0_row = src0 + i_src0;
	const src1_t * src1_row = src1 + i_src1;
	dst_t * dst_row = dst + i_dst;

	const int i10 = i0 % ne10;
	dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
	}


	// Host-side launcher for the broadcasting binary kernels.
	//
	// operator() reads the tensor extents/strides through
	// GGML_TENSOR_BINARY_OP_LOCALS, collapses leading dimensions while no
	// broadcast is involved, converts byte strides into element strides and
	// submits either k_bin_bcast (3D launch) or, when the dimension-0 group
	// count would exceed 65535, k_bin_bcast_unravel (1D launch) on `stream`.
	//
	// Both paths call dpct::has_capability_or_fail for sycl::aspect::fp16 on
	// the queue's device before submitting, regardless of the element types.
	template<float (*bin_op)(const float, const float)>
	struct bin_bcast_sycl {
	    template <typename src0_t, typename src1_t, typename dst_t>
	    void operator()(ggml_backend_sycl_context & ctx,
	                    const struct ggml_tensor *src0,
	                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
	                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
	                    queue_ptr stream) {

	        GGML_TENSOR_BINARY_OP_LOCALS

	        // A ratio of 1 means src1 is not broadcast along that dimension.
	        int nr0 = ne10/ne0;
	        int nr1 = ne11/ne1;
	        int nr2 = ne12/ne2;
	        int nr3 = ne13/ne3;

	        int nr[4] = { nr0, nr1, nr2, nr3 };

	        // collapse dimensions until first broadcast dimension
	        int64_t cne0[] = {ne0, ne1, ne2, ne3};
	        int64_t cne1[] = {ne10, ne11, ne12, ne13};
	        size_t cnb0[] = {nb0, nb1, nb2, nb3};
	        size_t cnb1[] = {nb10, nb11, nb12, nb13};
	        // Folds dimension 1 into dimension 0 and shifts the rest down.
	        auto collapse = [](int64_t cne[]) {
	            cne[0] *= cne[1];
	            cne[1] = cne[2];
	            cne[2] = cne[3];
	            cne[3] = 1;
	        };

	        // Rescales the byte strides to match the extents about to be collapsed;
	        // must run before collapse() on the same extents.
	        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
	            cnb[1] *= cne[1];
	            cnb[2] *= cne[2];
	            cnb[3] *= cne[3];
	        };

	        for (int i = 0; i < 4; i++) {
	            if (nr[i] != 1) {
	                break;
	            }
	            if (i > 0) {
	                collapse_nb(cnb0, cne0);
	                collapse_nb(cnb1, cne1);
	                collapse(cne0);
	                collapse(cne1);
	            }
	        }
	        {
	            // From here on the collapsed values shadow the macro-provided locals.
	            int64_t ne0 = cne0[0];
	            int64_t ne1 = cne0[1];
	            int64_t ne2 = cne0[2];
	            int64_t ne3 = cne0[3];

	            int64_t ne10 = cne1[0];
	            int64_t ne11 = cne1[1];
	            int64_t ne12 = cne1[2];
	            int64_t ne13 = cne1[3];

	            size_t nb0 = cnb0[0];
	            size_t nb1 = cnb0[1];
	            size_t nb2 = cnb0[2];
	            size_t nb3 = cnb0[3];

	            size_t nb10 = cnb1[0];
	            size_t nb11 = cnb1[1];
	            size_t nb12 = cnb1[2];
	            size_t nb13 = cnb1[3];

	            // Byte strides -> element strides.
	            size_t s0 = nb0 / sizeof(dst_t);
	            size_t s1 = nb1 / sizeof(dst_t);
	            size_t s2 = nb2 / sizeof(dst_t);
	            size_t s3 = nb3 / sizeof(dst_t);

	            size_t s10 = nb10 / sizeof(src1_t);
	            size_t s11 = nb11 / sizeof(src1_t);
	            size_t s12 = nb12 / sizeof(src1_t);
	            size_t s13 = nb13 / sizeof(src1_t);

	            // The kernels take no dim-0 stride, so dim 0 must be contiguous.
	            GGML_ASSERT(s0 == 1);
	            GGML_ASSERT(s10 == 1);

	            const int block_size = 128;

	            int64_t hne0 = std::max(ne0/2LL, 1LL);

	            sycl::range<3> block_dims(1, 1, 1);
	            block_dims[2] = std::min<unsigned int>(hne0, block_size);
	            block_dims[1] = std::min<unsigned int>(
	                ne1, block_size / (unsigned int)block_dims[2]);
	            block_dims[0] = std::min(
	                std::min<unsigned int>(
	                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
	                                   (unsigned int)block_dims[1]),
	                64U);

	            sycl::range<3> block_nums(
	                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
	                (ne1 + block_dims[1] - 1) / block_dims[1],
	                (hne0 + block_dims[2] - 1) / block_dims[2]);

	            if (block_nums[0] > 65535) {
	                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
	                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
	                {
	                    dpct::has_capability_or_fail(stream->get_device(),
	                                                 {sycl::aspect::fp16});

	                    stream->parallel_for(
	                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
	                                              sycl::range<3>(1, 1, block_size),
	                                          sycl::range<3>(1, 1, block_size)),
	                        [=](sycl::nd_item<3> item_ct1) {
	                            k_bin_bcast_unravel<bin_op>(
	                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
	                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
	                                s13, item_ct1);
	                        });
	                }
	            } else {
	                /*
	                DPCT1049:16: The work-group size passed to the SYCL kernel may
	                exceed the limit. To get the device limit, query
	                info::device::max_work_group_size. Adjust the work-group size if
	                needed.
	                */
	                dpct::has_capability_or_fail(stream->get_device(),
	                                             {sycl::aspect::fp16});

	                stream->parallel_for(
	                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
	                    [=](sycl::nd_item<3> item_ct1) {
	                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
	                                            ne2, ne3, ne10, ne11, ne12, ne13,
	                                            s1, s2, s3, s11, s12, s13,
	                                            item_ct1);
	                    });
	            }
	        }
	        GGML_UNUSED(ctx);
	    }
	};

	template <class op>
	inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
	const ggml_tensor src1, ggml_tensor dst,
	const float src0_dd, const float src1_dd,
	float *dst_dd,
	const queue_ptr &main_stream) {

	if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
	op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
	} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
	op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
	(sycl::half *)dst_dd, main_stream);
	} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
	op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
	main_stream);
	} else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
	op()(ctx, src0, src1, dst, (const int32_t )src0_dd, (const int32_t )src1_dd, (int32_t *)dst_dd,
	main_stream);
	} else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
	op()(ctx, src0, src1, dst, (const int16_t )src0_dd, (const int16_t )src1_dd, (int16_t *)dst_dd,
	main_stream);
	} else {
	fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
	ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
	GGML_ABORT("fatal error");
	}
	}

	// Declared here, defined in another translation unit.
	// NOTE(review): presumably reports whether `dev` provides XMX matrix
	// hardware - confirm against the definition.
	bool gpu_has_xmx(sycl::device &dev);

	void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
	const ggml_tensor src1, ggml_tensor dst,
	const ggml_sycl_op_flatten_t op);

	#endif // GGML_SYCL_COMMON_HPP