mirror of
https://github.com/autc04/Retro68.git
synced 2024-12-01 11:52:47 +00:00
542 lines
17 KiB
C++
542 lines
17 KiB
C++
/*
|
|
Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of Intel Corporation nor the names of its
|
|
contributors may be used to endorse or promote products derived
|
|
from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
|
|
/*! \file
|
|
\brief The parts of the runtime library used only on the host
|
|
*/
|
|
|
|
#ifndef OFFLOAD_HOST_H_INCLUDED
|
|
#define OFFLOAD_HOST_H_INCLUDED
|
|
|
|
#ifndef TARGET_WINNT
|
|
#include <unistd.h>
|
|
#endif // TARGET_WINNT
|
|
#include "offload_common.h"
|
|
#include "offload_util.h"
|
|
#include "offload_engine.h"
|
|
#include "offload_env.h"
|
|
#include "offload_orsl.h"
|
|
#include "coi/coi_client.h"
|
|
|
|
// MIC engines.
|
|
DLL_LOCAL extern Engine* mic_engines;
|
|
DLL_LOCAL extern uint32_t mic_engines_total;
|
|
|
|
// DMA channel count used by COI and set via
|
|
// OFFLOAD_DMA_CHANNEL_COUNT environment variable
|
|
DLL_LOCAL extern uint32_t mic_dma_channel_count;
|
|
|
|
//! Layout of a packed target image.
/*! The image is packed as follows:
      1. 8 bytes containing the size of the target binary,
      2. a null-terminated string which is the binary name,
      3. <size> bytes that are the contents of the image.
    The address of symbol __offload_target_image is the address of this
    structure. */
struct Image {
    int64_t size;   //!< Size in bytes of the target binary name and contents
    char    data[]; //!< Binary name followed by the contents of the image
};
|
|
|
|
// The offload descriptor.
|
|
class OffloadDescriptor
|
|
{
|
|
public:
|
|
enum OmpAsyncLastEventType {
|
|
c_last_not, // not last event
|
|
c_last_write, // the last event that is write
|
|
c_last_read, // the last event that is read
|
|
c_last_runfunc // the last event that is runfunction
|
|
};
|
|
|
|
OffloadDescriptor(
|
|
int index,
|
|
_Offload_status *status,
|
|
bool is_mandatory,
|
|
bool is_openmp,
|
|
OffloadHostTimerData * timer_data
|
|
) :
|
|
m_device(mic_engines[index == -1 ? 0 : index % mic_engines_total]),
|
|
m_is_mandatory(is_mandatory),
|
|
m_is_openmp(is_openmp),
|
|
m_inout_buf(0),
|
|
m_func_desc(0),
|
|
m_func_desc_size(0),
|
|
m_num_in_dependencies(0),
|
|
m_p_in_dependencies(0),
|
|
m_in_deps(0),
|
|
m_in_deps_total(0),
|
|
m_in_deps_allocated(0),
|
|
m_out_deps(0),
|
|
m_out_deps_total(0),
|
|
m_out_deps_allocated(0),
|
|
m_vars(0),
|
|
m_vars_extra(0),
|
|
m_status(status),
|
|
m_timer_data(timer_data),
|
|
m_out_with_preallocated(false),
|
|
m_preallocated_alloc(false),
|
|
m_traceback_called(false),
|
|
m_stream(-1),
|
|
m_signal(0),
|
|
m_has_signal(0),
|
|
m_omp_async_last_event_type(c_last_not)
|
|
{
|
|
m_wait_all_devices = index == -1;
|
|
}
|
|
|
|
~OffloadDescriptor()
|
|
{
|
|
if (m_in_deps != 0) {
|
|
free(m_in_deps);
|
|
}
|
|
if (m_out_deps != 0) {
|
|
free(m_out_deps);
|
|
}
|
|
if (m_func_desc != 0) {
|
|
free(m_func_desc);
|
|
}
|
|
if (m_vars != 0) {
|
|
free(m_vars);
|
|
free(m_vars_extra);
|
|
}
|
|
}
|
|
|
|
bool offload(const char *name, bool is_empty,
|
|
VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
const void **waits, int num_waits, const void **signal,
|
|
int entry_id, const void *stack_addr,
|
|
OffloadFlags offload_flags);
|
|
|
|
bool offload_finish(bool is_traceback);
|
|
|
|
bool is_signaled();
|
|
|
|
OffloadHostTimerData* get_timer_data() const {
|
|
return m_timer_data;
|
|
}
|
|
|
|
void set_stream(_Offload_stream stream) {
|
|
m_stream = stream;
|
|
}
|
|
|
|
_Offload_stream get_stream() {
|
|
return(m_stream);
|
|
}
|
|
|
|
Engine& get_device() {
|
|
return m_device;
|
|
}
|
|
|
|
void* get_signal() {
|
|
return(m_signal);
|
|
}
|
|
|
|
void set_signal(const void* signal) {
|
|
m_has_signal = 1;
|
|
m_signal = const_cast<void*>(signal);
|
|
}
|
|
|
|
void cleanup();
|
|
|
|
uint32_t m_event_count;
|
|
bool m_has_signal;
|
|
|
|
private:
|
|
bool offload_wrap(const char *name, bool is_empty,
|
|
VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
const void **waits, int num_waits, const void **signal,
|
|
int entry_id, const void *stack_addr,
|
|
OffloadFlags offload_flags);
|
|
bool wait_dependencies(const void **waits, int num_waits,
|
|
_Offload_stream stream);
|
|
bool setup_descriptors(VarDesc *vars, VarDesc2 *vars2, int vars_total,
|
|
int entry_id, const void *stack_addr);
|
|
bool setup_misc_data(const char *name);
|
|
bool send_pointer_data(bool is_async, void* info);
|
|
bool send_noncontiguous_pointer_data(
|
|
int i,
|
|
PtrData* src_buf,
|
|
PtrData* dst_buf,
|
|
COIEVENT *event,
|
|
uint64_t &sent_data,
|
|
uint32_t in_deps_amount,
|
|
COIEVENT *in_deps
|
|
);
|
|
bool receive_noncontiguous_pointer_data(
|
|
int i,
|
|
COIBUFFER dst_buf,
|
|
COIEVENT *event,
|
|
uint64_t &received_data,
|
|
uint32_t in_deps_amount,
|
|
COIEVENT *in_deps
|
|
);
|
|
|
|
bool gather_copyin_data();
|
|
|
|
bool compute(void *);
|
|
|
|
bool receive_pointer_data(bool is_async, bool first_run, void * info);
|
|
bool scatter_copyout_data();
|
|
|
|
bool find_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
|
|
int64_t length, bool is_targptr,
|
|
bool error_does_not_exist = true);
|
|
|
|
void find_device_ptr( int64_t* &device_ptr,
|
|
void *host_ptr);
|
|
|
|
bool alloc_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
|
|
int64_t length, int64_t alloc_disp, int align,
|
|
bool is_targptr, bool is_prealloc, bool pin);
|
|
bool create_preallocated_buffer(PtrData* ptr_data, void *base);
|
|
bool init_static_ptr_data(PtrData *ptr_data);
|
|
bool init_mic_address(PtrData *ptr_data);
|
|
bool offload_stack_memory_manager(
|
|
const void * stack_begin,
|
|
int routine_id,
|
|
int buf_size,
|
|
int align,
|
|
bool thread_specific_function_locals,
|
|
bool *is_new);
|
|
char *get_this_threads_cpu_stack_addr(
|
|
const void * stack_begin,
|
|
int routine_id,
|
|
bool thread_specific_function_locals);
|
|
PtrData *get_this_threads_mic_stack_addr(
|
|
const void * stack_begin,
|
|
int routine_id,
|
|
bool thread_specific_function_locals);
|
|
bool nullify_target_stack(COIBUFFER targ_buf, uint64_t size);
|
|
|
|
bool gen_var_descs_for_pointer_array(int i);
|
|
|
|
void get_stream_in_dependencies(uint32_t &in_deps_amount,
|
|
COIEVENT* &in_deps);
|
|
|
|
void report_coi_error(error_types msg, COIRESULT res);
|
|
_Offload_result translate_coi_error(COIRESULT res) const;
|
|
|
|
void setup_omp_async_info();
|
|
|
|
void setup_use_device_ptr(int i);
|
|
|
|
void register_event_call_back(void (*)(
|
|
COIEVENT,
|
|
const COIRESULT,
|
|
const void*),
|
|
const COIEVENT *event,
|
|
const void *info);
|
|
|
|
void register_omp_event_call_back(const COIEVENT *event, const void *info);
|
|
|
|
private:
|
|
typedef std::list<COIBUFFER> BufferList;
|
|
|
|
// extra data associated with each variable descriptor
|
|
struct VarExtra {
|
|
PtrData* src_data;
|
|
PtrData* dst_data;
|
|
AutoData* auto_data;
|
|
int64_t cpu_disp;
|
|
int64_t cpu_offset;
|
|
void *alloc;
|
|
union {
|
|
CeanReadRanges *read_rng_src;
|
|
NonContigDesc *noncont_desc;
|
|
};
|
|
CeanReadRanges *read_rng_dst;
|
|
int64_t ptr_arr_offset;
|
|
bool is_arr_ptr_el;
|
|
OmpAsyncLastEventType omp_last_event_type;
|
|
int64_t pointer_offset;
|
|
uint16_t type_src;
|
|
uint16_t type_dst;
|
|
};
|
|
|
|
template<typename T> class ReadArrElements {
|
|
public:
|
|
ReadArrElements():
|
|
ranges(NULL),
|
|
el_size(sizeof(T)),
|
|
offset(0),
|
|
count(0),
|
|
is_empty(true),
|
|
base(NULL)
|
|
{}
|
|
|
|
bool read_next(bool flag)
|
|
{
|
|
if (flag != 0) {
|
|
if (is_empty) {
|
|
if (ranges) {
|
|
if (!get_next_range(ranges, &offset)) {
|
|
// ranges are over
|
|
return false;
|
|
}
|
|
}
|
|
// all contiguous elements are over
|
|
else if (count != 0) {
|
|
return false;
|
|
}
|
|
|
|
length_cur = size;
|
|
}
|
|
else {
|
|
offset += el_size;
|
|
}
|
|
val = (T)get_el_value(base, offset, el_size);
|
|
length_cur -= el_size;
|
|
count++;
|
|
is_empty = length_cur == 0;
|
|
}
|
|
return true;
|
|
}
|
|
public:
|
|
CeanReadRanges * ranges;
|
|
T val;
|
|
int el_size;
|
|
int64_t size,
|
|
offset,
|
|
length_cur;
|
|
bool is_empty;
|
|
int count;
|
|
char *base;
|
|
};
|
|
|
|
// ptr_data for persistent auto objects
|
|
PtrData* m_stack_ptr_data;
|
|
PtrDataList m_destroy_stack;
|
|
|
|
// Engine
|
|
Engine& m_device;
|
|
|
|
// true for offload_wait target(mic) stream(0)
|
|
bool m_wait_all_devices;
|
|
|
|
// if true offload is mandatory
|
|
bool m_is_mandatory;
|
|
|
|
// if true offload has openmp origin
|
|
const bool m_is_openmp;
|
|
|
|
// The Marshaller for the inputs of the offloaded region.
|
|
Marshaller m_in;
|
|
|
|
// The Marshaller for the outputs of the offloaded region.
|
|
Marshaller m_out;
|
|
|
|
// List of buffers that are passed to dispatch call
|
|
BufferList m_compute_buffers;
|
|
|
|
// List of buffers that need to be destroyed at the end of offload
|
|
BufferList m_destroy_buffers;
|
|
|
|
// Variable descriptors
|
|
VarDesc* m_vars;
|
|
VarExtra* m_vars_extra;
|
|
int m_vars_total;
|
|
|
|
// Pointer to a user-specified status variable
|
|
_Offload_status *m_status;
|
|
|
|
// Function descriptor
|
|
FunctionDescriptor* m_func_desc;
|
|
uint32_t m_func_desc_size;
|
|
|
|
// Buffer for transferring copyin/copyout data
|
|
COIBUFFER m_inout_buf;
|
|
|
|
|
|
// Dependencies
|
|
COIEVENT *m_in_deps;
|
|
uint32_t m_in_deps_total;
|
|
uint32_t m_in_deps_allocated;
|
|
COIEVENT *m_out_deps;
|
|
uint32_t m_out_deps_total;
|
|
uint32_t m_out_deps_allocated;
|
|
|
|
// 2 variables defines input dependencies for current COI API.
|
|
// The calls to routines as BufferWrite/PipelineRunFunction/BufferRead
|
|
// is supposed to have input dependencies.
|
|
// 2 variables below defines the number and vector of dependencies
|
|
// in every current moment of offload.
|
|
// So any phase of offload can use its values as input dependencies
|
|
// for the COI API that the phase calls.
|
|
// It means that all phases (of Write, RunFunction,Read) must keep
|
|
// the variables correct to be used by following phase.
|
|
// If some consequent offloads are connected (i.e. by the same stream)
|
|
// the final 2 variables of the offload is used as initial inputs
|
|
// for the next offload.
|
|
uint32_t m_num_in_dependencies;
|
|
COIEVENT *m_p_in_dependencies;
|
|
|
|
// Stream
|
|
_Offload_stream m_stream;
|
|
|
|
// Signal
|
|
void* m_signal;
|
|
|
|
// Timer data
|
|
OffloadHostTimerData *m_timer_data;
|
|
|
|
// copyin/copyout data length
|
|
uint64_t m_in_datalen;
|
|
uint64_t m_out_datalen;
|
|
|
|
// a boolean value calculated in setup_descriptors. If true we need to do
|
|
// a run function on the target. Otherwise it may be optimized away.
|
|
bool m_need_runfunction;
|
|
|
|
// initialized value of m_need_runfunction;
|
|
// is used to recognize offload_transfer
|
|
bool m_initial_need_runfunction;
|
|
|
|
// a Boolean value set to true when OUT clauses with preallocated targetptr
|
|
// is encountered to indicate that call receive_pointer_data needs to be
|
|
// invoked again after call to scatter_copyout_data.
|
|
bool m_out_with_preallocated;
|
|
|
|
// a Boolean value set to true if an alloc_if(1) is used with preallocated
|
|
// targetptr to indicate the need to scatter_copyout_data even for
|
|
// async offload
|
|
bool m_preallocated_alloc;
|
|
|
|
// a Boolean value set to true if traceback routine is called
|
|
bool m_traceback_called;
|
|
|
|
OmpAsyncLastEventType m_omp_async_last_event_type;
|
|
};
|
|
|
|
// Initialization types for MIC
enum OffloadInitType {
    c_init_on_start       = 0,  // all devices before entering main
    c_init_on_offload     = 1,  // single device before starting the first offload
    c_init_on_offload_all = 2   // all devices before starting the first offload
};
|
|
|
|
// Determines if MIC code is an executable or a shared library
|
|
extern "C" bool __offload_target_image_is_executable(const void *target_image);
|
|
|
|
// Initializes library and registers specified offload image.
|
|
extern "C" bool __offload_register_image(const void* image);
|
|
extern "C" void __offload_unregister_image(const void* image);
|
|
|
|
// Registers asynchronous task completion callback
|
|
extern "C" void __offload_register_task_callback(void (*cb)(void *));
|
|
|
|
// Initializes offload runtime library.
|
|
DLL_LOCAL extern int __offload_init_library(void);
|
|
|
|
// thread data for associating pipelines with threads
|
|
DLL_LOCAL extern pthread_key_t mic_thread_key;
|
|
|
|
// location of offload_main executable
|
|
// To be used if the main application has no offload and is not built
|
|
// with -offload but dynamic library linked in has offload pragma
|
|
DLL_LOCAL extern char* mic_device_main;
|
|
|
|
// Environment variables for devices
|
|
DLL_LOCAL extern MicEnvVar mic_env_vars;
|
|
|
|
// CPU frequency
|
|
DLL_LOCAL extern uint64_t cpu_frequency;
|
|
|
|
// LD_LIBRARY_PATH for KNC libraries
|
|
DLL_LOCAL extern char* knc_library_path;
|
|
|
|
// LD_LIBRARY_PATH for KNL libraries
|
|
DLL_LOCAL extern char* knl_library_path;
|
|
|
|
// stack size for target
|
|
DLL_LOCAL extern uint32_t mic_stack_size;
|
|
|
|
// Preallocated memory size for buffers on MIC
|
|
DLL_LOCAL extern uint64_t mic_buffer_size;
|
|
|
|
// Preallocated 4K page memory size for buffers on MIC
|
|
DLL_LOCAL extern uint64_t mic_4k_buffer_size;
|
|
|
|
// Preallocated 2M page memory size for buffers on MIC
|
|
DLL_LOCAL extern uint64_t mic_2m_buffer_size;
|
|
|
|
// Setting controlling inout proxy
|
|
DLL_LOCAL extern bool mic_proxy_io;
|
|
DLL_LOCAL extern char* mic_proxy_fs_root;
|
|
|
|
// Threshold for creating buffers with large pages
|
|
DLL_LOCAL extern uint64_t __offload_use_2mb_buffers;
|
|
|
|
// offload initialization type
|
|
DLL_LOCAL extern OffloadInitType __offload_init_type;
|
|
|
|
// Device number to offload to when device is not explicitly specified.
|
|
DLL_LOCAL extern int __omp_device_num;
|
|
|
|
// target executable
|
|
DLL_LOCAL extern TargetImage* __target_exe;
|
|
|
|
// is true if last loaded image is dll
|
|
DLL_LOCAL extern bool __current_image_is_dll;
|
|
// is true if myo library is loaded when dll is loaded
|
|
DLL_LOCAL extern bool __myo_init_in_so;
|
|
|
|
// IDB support

// Size of the buffer holding the path to the hosting process image.
#define MAX_TARGET_NAME 512

extern "C" {

// Called by the offload runtime after initialization of offload
// infrastructure has been completed.
void __dbg_target_so_loaded();

// Called by the offload runtime when the offload infrastructure is about to
// be shut down, currently at application exit.
void __dbg_target_so_unloaded();

// Null-terminated string containing path to the process image of the hosting
// application (offload_main).
// The variables in this block carry an explicit `extern` so that they stay
// declarations inside the braced linkage specification.
extern char __dbg_target_exe_name[MAX_TARGET_NAME];

// Integer specifying the process id
extern pid_t __dbg_target_so_pid;

// Integer specifying the 0-based device number
extern int __dbg_target_id;

// Set to non-zero by the host-side debugger to enable offload debugging
// support
extern int __dbg_is_attached;

// Major version of the debugger support API
extern const int __dbg_api_major_version;

// Minor version of the debugger support API
extern const int __dbg_api_minor_version;

} // extern "C"
|
|
|
|
#endif // OFFLOAD_HOST_H_INCLUDED
|