mirror of
https://github.com/autc04/Retro68.git
synced 2024-12-28 14:31:50 +00:00
670 lines
18 KiB
C++
670 lines
18 KiB
C++
/*
|
|
Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of Intel Corporation nor the names of its
|
|
contributors may be used to endorse or promote products derived
|
|
from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
|
|
#ifndef OFFLOAD_ENGINE_H_INCLUDED
|
|
#define OFFLOAD_ENGINE_H_INCLUDED
|
|
|
|
#include <limits.h>
|
|
#include <bitset>
|
|
#include <list>
|
|
#include <set>
|
|
#include <map>
|
|
#include "offload_common.h"
|
|
#include "coi/coi_client.h"
|
|
|
|
// Sentinel descriptor value. Engine::find_signal() writes it into the signal
// map in place of the stored descriptor when it is called with remove == true.
#define SIGNAL_IS_REMOVED ((OffloadDescriptor *)-1)
|
|
// Reserved stream value -1. NOTE(review): presumably means "no stream was
// specified"; no user of this constant is visible in this header - confirm.
const int64_t no_stream = -1;
|
|
|
|
// Address range
|
|
class MemRange {
public:
    // Creates an empty range located at the null address.
    MemRange() : m_start(0), m_length(0) {}

    // Creates a range of len bytes that begins at addr.
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // Address of the first byte of the range.
    const void* start() const { return m_start; }

    // Address one past the last byte of the range (the range is half-open).
    const void* end() const {
        const char *first_byte = static_cast<const char*>(m_start);
        return first_byte + m_length;
    }

    // Size of the range in bytes.
    uint64_t length() const { return m_length; }

    // returns true if given range overlaps with another one
    bool overlaps(const MemRange &o) const {
        // Half-open ranges A[start, end) and B[start, end) are disjoint
        // when A begins at or after B's end, or A ends at or before B's
        // start. They overlap in every other case.
        const bool disjoint = start() >= o.end() || end() <= o.start();
        return !disjoint;
    }

    // returns true if given range contains the other range
    bool contains(const MemRange &o) const {
        if (o.start() < start()) {
            // The other range begins before this one does.
            return false;
        }
        // The other range must not run past the end of this one.
        return !(end() < o.end());
    }

private:
    const void* m_start;
    uint64_t    m_length;
};
|
|
|
|
// Data associated with a pointer variable
|
|
class PtrData {
|
|
public:
|
|
PtrData(const void *addr, uint64_t len) :
|
|
cpu_addr(addr, len), cpu_buf(0),
|
|
mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
|
|
ref_count(0), is_static(false)
|
|
{}
|
|
|
|
//
|
|
// Copy constructor
|
|
//
|
|
PtrData(const PtrData& ptr):
|
|
cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
|
|
mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
|
|
mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
|
|
ref_count(ptr.ref_count), is_static(ptr.is_static)
|
|
{}
|
|
|
|
bool operator<(const PtrData &o) const {
|
|
// Variables are sorted by the CPU start address.
|
|
// Overlapping memory ranges are considered equal.
|
|
return (cpu_addr.start() < o.cpu_addr.start()) &&
|
|
!cpu_addr.overlaps(o.cpu_addr);
|
|
}
|
|
|
|
long add_reference() {
|
|
if (is_static) {
|
|
return LONG_MAX;
|
|
}
|
|
#ifndef TARGET_WINNT
|
|
return __sync_fetch_and_add(&ref_count, 1);
|
|
#else // TARGET_WINNT
|
|
return _InterlockedIncrement(&ref_count) - 1;
|
|
#endif // TARGET_WINNT
|
|
}
|
|
|
|
long remove_reference() {
|
|
if (is_static) {
|
|
return LONG_MAX;
|
|
}
|
|
#ifndef TARGET_WINNT
|
|
return __sync_sub_and_fetch(&ref_count, 1);
|
|
#else // TARGET_WINNT
|
|
return _InterlockedDecrement(&ref_count);
|
|
#endif // TARGET_WINNT
|
|
}
|
|
|
|
long get_reference() const {
|
|
if (is_static) {
|
|
return LONG_MAX;
|
|
}
|
|
return ref_count;
|
|
}
|
|
|
|
public:
|
|
// CPU address range
|
|
const MemRange cpu_addr;
|
|
|
|
// CPU and MIC buffers
|
|
COIBUFFER cpu_buf;
|
|
COIBUFFER mic_buf;
|
|
|
|
// placeholder for buffer address on mic
|
|
uint64_t mic_addr;
|
|
|
|
uint64_t alloc_disp;
|
|
|
|
// additional offset to pointer data on MIC for improving bandwidth for
|
|
// data which is not 4K aligned
|
|
uint32_t mic_offset;
|
|
|
|
// if true buffers are created from static memory
|
|
bool is_static;
|
|
mutex_t alloc_ptr_data_lock;
|
|
|
|
private:
|
|
// reference count for the entry
|
|
long ref_count;
|
|
};
|
|
|
|
// List of pointers to PtrData entries.
typedef std::list<PtrData*> PtrDataList;
|
|
|
|
class PtrDataTable {
|
|
public:
|
|
typedef std::set<PtrData> PtrSet;
|
|
|
|
PtrData* find_ptr_data(const void *ptr) {
|
|
m_ptr_lock.lock();
|
|
PtrSet::iterator res = list.find(PtrData(ptr, 0));
|
|
|
|
m_ptr_lock.unlock();
|
|
if (res == list.end()) {
|
|
return 0;
|
|
}
|
|
return const_cast<PtrData*>(res.operator->());
|
|
}
|
|
|
|
PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
|
|
m_ptr_lock.lock();
|
|
std::pair<PtrSet::iterator, bool> res =
|
|
list.insert(PtrData(ptr, len));
|
|
|
|
PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
|
|
m_ptr_lock.unlock();
|
|
|
|
is_new = res.second;
|
|
if (is_new) {
|
|
// It's necessary to lock as soon as possible.
|
|
// unlock must be done at call site of insert_ptr_data at
|
|
// branch for is_new
|
|
ptr_data->alloc_ptr_data_lock.lock();
|
|
}
|
|
return ptr_data;
|
|
}
|
|
|
|
void remove_ptr_data(const void *ptr) {
|
|
m_ptr_lock.lock();
|
|
list.erase(PtrData(ptr, 0));
|
|
m_ptr_lock.unlock();
|
|
}
|
|
private:
|
|
|
|
PtrSet list;
|
|
mutex_t m_ptr_lock;
|
|
};
|
|
|
|
// Data associated with automatic variable
|
|
class AutoData {
|
|
public:
|
|
AutoData(const void *addr, uint64_t len) :
|
|
cpu_addr(addr, len), ref_count(0)
|
|
{}
|
|
|
|
bool operator<(const AutoData &o) const {
|
|
// Variables are sorted by the CPU start address.
|
|
// Overlapping memory ranges are considered equal.
|
|
return (cpu_addr.start() < o.cpu_addr.start()) &&
|
|
!cpu_addr.overlaps(o.cpu_addr);
|
|
}
|
|
|
|
long add_reference() {
|
|
#ifndef TARGET_WINNT
|
|
return __sync_fetch_and_add(&ref_count, 1);
|
|
#else // TARGET_WINNT
|
|
return _InterlockedIncrement(&ref_count) - 1;
|
|
#endif // TARGET_WINNT
|
|
}
|
|
|
|
long remove_reference() {
|
|
#ifndef TARGET_WINNT
|
|
return __sync_sub_and_fetch(&ref_count, 1);
|
|
#else // TARGET_WINNT
|
|
return _InterlockedDecrement(&ref_count);
|
|
#endif // TARGET_WINNT
|
|
}
|
|
|
|
long nullify_reference() {
|
|
#ifndef TARGET_WINNT
|
|
return __sync_lock_test_and_set(&ref_count, 0);
|
|
#else // TARGET_WINNT
|
|
return _InterlockedExchange(&ref_count,0);
|
|
#endif // TARGET_WINNT
|
|
}
|
|
|
|
long get_reference() const {
|
|
return ref_count;
|
|
}
|
|
|
|
public:
|
|
// CPU address range
|
|
const MemRange cpu_addr;
|
|
|
|
private:
|
|
// reference count for the entry
|
|
long ref_count;
|
|
};
|
|
|
|
// Set of automatic variables
|
|
typedef std::set<AutoData> AutoSet;
|
|
|
|
// Target image data
|
|
struct TargetImage
{
    // Records the given pointers and sizes as-is. The strings and the image
    // contents are not copied, only the pointers are stored.
    TargetImage(const char *image_name, const void *image_data,
                uint64_t image_size, const char *origin_file,
                uint64_t origin_offset) :
        name(image_name),
        data(image_data),
        size(image_size),
        origin(origin_file),
        offset(origin_offset)
    {}

    // library name
    const char* name;

    // contents and size
    const void* data;
    uint64_t    size;

    // file of origin and offset within that file
    const char* origin;
    uint64_t    offset;
};
|
|
|
|
// List of target images; Engine keeps one as m_images.
typedef std::list<TargetImage> TargetImageList;
|
|
|
|
// dynamic library and Image associated with lib
|
|
struct DynLib
|
|
{
|
|
DynLib(const char *_name, const void *_data,
|
|
COILIBRARY _lib) :
|
|
name(_name), data(_data), lib(_lib)
|
|
{}
|
|
// library name
|
|
const char* name;
|
|
|
|
// contents
|
|
const void* data;
|
|
|
|
COILIBRARY lib;
|
|
};
|
|
// List of dynamic libraries; Engine keeps one as m_dyn_libs.
typedef std::list<DynLib> DynLibList;
|
|
|
|
// Data associated with persistent auto objects
|
|
struct PersistData
|
|
{
|
|
PersistData(const void *addr, uint64_t routine_num,
|
|
uint64_t size, uint64_t thread) :
|
|
stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
|
|
{
|
|
stack_ptr_data = new PtrData(0, size);
|
|
}
|
|
// 1-st key value - beginning of the stack at CPU
|
|
const void * stack_cpu_addr;
|
|
// 2-nd key value - identifier of routine invocation at CPU
|
|
uint64_t routine_id;
|
|
// 3-rd key value - thread identifier
|
|
uint64_t thread_id;
|
|
|
|
// corresponded PtrData; only stack_ptr_data->mic_buf is used
|
|
PtrData * stack_ptr_data;
|
|
// used to get offset of the variable in stack buffer
|
|
char * cpu_stack_addr;
|
|
};
|
|
|
|
// List of PersistData records; Engine keeps one as m_persist_list.
typedef std::list<PersistData> PersistDataList;
|
|
|
|
// Data associated with stream
|
|
struct Stream
|
|
{
|
|
Stream(int device, int num_of_cpus) :
|
|
m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
|
|
m_device(device)
|
|
{}
|
|
~Stream() {
|
|
if (m_pipeline) {
|
|
COI::PipelineDestroy(m_pipeline);
|
|
}
|
|
}
|
|
|
|
COIPIPELINE get_pipeline(void) {
|
|
return(m_pipeline);
|
|
}
|
|
|
|
int get_device(void) {
|
|
return(m_device);
|
|
}
|
|
|
|
int get_cpu_number(void) {
|
|
return(m_number_of_cpus);
|
|
}
|
|
|
|
void set_pipeline(COIPIPELINE pipeline) {
|
|
m_pipeline = pipeline;
|
|
}
|
|
|
|
OffloadDescriptor* get_last_offload(void) {
|
|
return(m_last_offload);
|
|
}
|
|
|
|
void set_last_offload(OffloadDescriptor* last_offload) {
|
|
m_last_offload = last_offload;
|
|
}
|
|
|
|
static Stream* find_stream(uint64_t handle, bool remove);
|
|
|
|
static _Offload_stream add_stream(int device, int number_of_cpus) {
|
|
m_stream_lock.lock();
|
|
all_streams[++m_streams_count] = new Stream(device, number_of_cpus);
|
|
m_stream_lock.unlock();
|
|
return(m_streams_count);
|
|
}
|
|
|
|
typedef std::map<uint64_t, Stream*> StreamMap;
|
|
|
|
static uint64_t m_streams_count;
|
|
static StreamMap all_streams;
|
|
static mutex_t m_stream_lock;
|
|
|
|
int m_device;
|
|
|
|
// number of cpus
|
|
int m_number_of_cpus;
|
|
|
|
// The pipeline associated with the stream
|
|
COIPIPELINE m_pipeline;
|
|
|
|
// The last offload occured via the stream
|
|
OffloadDescriptor* m_last_offload;
|
|
|
|
// Cpus used by the stream
|
|
std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
|
|
};
|
|
|
|
// Map from a uint64_t key to Stream pointers at namespace scope; identical
// to the nested Stream::StreamMap typedef. Engine uses it for m_stream_map.
typedef std::map<uint64_t, Stream*> StreamMap;
|
|
|
|
// class representing a single engine
|
|
struct Engine {
|
|
friend void __offload_init_library_once(void);
|
|
friend void __offload_fini_library(void);
|
|
|
|
#define check_result(res, tag, ...) \
|
|
{ \
|
|
if (res == COI_PROCESS_DIED) { \
|
|
fini_process(true); \
|
|
exit(1); \
|
|
} \
|
|
if (res != COI_SUCCESS) { \
|
|
__liboffload_error_support(tag, __VA_ARGS__); \
|
|
exit(1); \
|
|
} \
|
|
}
|
|
|
|
int get_logical_index() const {
|
|
return m_index;
|
|
}
|
|
|
|
int get_physical_index() const {
|
|
return m_physical_index;
|
|
}
|
|
|
|
const COIPROCESS& get_process() const {
|
|
return m_process;
|
|
}
|
|
|
|
uint64_t get_thread_id(void);
|
|
|
|
// initialize device
|
|
void init(void);
|
|
|
|
// unload library
|
|
void unload_library(const void *data, const char *name);
|
|
|
|
// add new library
|
|
void add_lib(const TargetImage &lib)
|
|
{
|
|
m_lock.lock();
|
|
m_ready = false;
|
|
m_images.push_back(lib);
|
|
m_lock.unlock();
|
|
}
|
|
|
|
COIRESULT compute(
|
|
_Offload_stream stream,
|
|
const std::list<COIBUFFER> &buffers,
|
|
const void* data,
|
|
uint16_t data_size,
|
|
void* ret,
|
|
uint16_t ret_size,
|
|
uint32_t num_deps,
|
|
const COIEVENT* deps,
|
|
COIEVENT* event
|
|
);
|
|
|
|
#ifdef MYO_SUPPORT
|
|
// temporary workaround for blocking behavior for myoiLibInit/Fini calls
|
|
void init_myo(COIEVENT *event) {
|
|
COIRESULT res;
|
|
res = COI::PipelineRunFunction(get_pipeline(),
|
|
m_funcs[c_func_myo_init],
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
event);
|
|
check_result(res, c_pipeline_run_func, m_index, res);
|
|
}
|
|
|
|
void fini_myo(COIEVENT *event) {
|
|
COIRESULT res;
|
|
res = COI::PipelineRunFunction(get_pipeline(),
|
|
m_funcs[c_func_myo_fini],
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
event);
|
|
check_result(res, c_pipeline_run_func, m_index, res);
|
|
}
|
|
#endif // MYO_SUPPORT
|
|
|
|
//
|
|
// Memory association table
|
|
//
|
|
PtrData* find_ptr_data(const void *ptr) {
|
|
return m_ptr_set.find_ptr_data(ptr);
|
|
}
|
|
|
|
PtrData* find_targetptr_data(const void *ptr) {
|
|
return m_targetptr_set.find_ptr_data(ptr);
|
|
}
|
|
|
|
PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
|
|
return m_ptr_set.insert_ptr_data(ptr, len, is_new);
|
|
}
|
|
|
|
PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
|
|
bool &is_new) {
|
|
return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
|
|
}
|
|
|
|
void remove_ptr_data(const void *ptr) {
|
|
m_ptr_set.remove_ptr_data(ptr);
|
|
}
|
|
|
|
void remove_targetptr_data(const void *ptr) {
|
|
m_targetptr_set.remove_ptr_data(ptr);
|
|
}
|
|
|
|
//
|
|
// Automatic variables
|
|
//
|
|
AutoData* find_auto_data(const void *ptr) {
|
|
AutoSet &auto_vars = get_auto_vars();
|
|
AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
|
|
if (res == auto_vars.end()) {
|
|
return 0;
|
|
}
|
|
return const_cast<AutoData*>(res.operator->());
|
|
}
|
|
|
|
AutoData* insert_auto_data(const void *ptr, uint64_t len) {
|
|
AutoSet &auto_vars = get_auto_vars();
|
|
std::pair<AutoSet::iterator, bool> res =
|
|
auto_vars.insert(AutoData(ptr, len));
|
|
return const_cast<AutoData*>(res.first.operator->());
|
|
}
|
|
|
|
void remove_auto_data(const void *ptr) {
|
|
get_auto_vars().erase(AutoData(ptr, 0));
|
|
}
|
|
|
|
//
|
|
// Signals
|
|
//
|
|
void add_signal(const void *signal, OffloadDescriptor *desc) {
|
|
m_signal_lock.lock();
|
|
m_signal_map[signal] = desc;
|
|
m_signal_lock.unlock();
|
|
}
|
|
|
|
OffloadDescriptor* find_signal(const void *signal, bool remove) {
|
|
OffloadDescriptor *desc = 0;
|
|
|
|
m_signal_lock.lock();
|
|
{
|
|
SignalMap::iterator it = m_signal_map.find(signal);
|
|
if (it != m_signal_map.end()) {
|
|
desc = it->second;
|
|
if (remove) {
|
|
it->second = SIGNAL_IS_REMOVED;
|
|
}
|
|
}
|
|
}
|
|
m_signal_lock.unlock();
|
|
|
|
return desc;
|
|
}
|
|
|
|
void stream_destroy(_Offload_stream handle);
|
|
|
|
COIPIPELINE get_pipeline(_Offload_stream stream);
|
|
|
|
StreamMap get_stream_map() {
|
|
return m_stream_map;
|
|
}
|
|
|
|
// stop device process
|
|
void fini_process(bool verbose);
|
|
|
|
// list of stacks active at the engine
|
|
PersistDataList m_persist_list;
|
|
|
|
private:
|
|
Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
|
|
m_proc_number(0)
|
|
{}
|
|
|
|
~Engine() {
|
|
for (StreamMap::iterator it = m_stream_map.begin();
|
|
it != m_stream_map.end(); it++) {
|
|
Stream * stream = it->second;
|
|
delete stream;
|
|
}
|
|
if (m_process != 0) {
|
|
fini_process(false);
|
|
}
|
|
}
|
|
|
|
// set indexes
|
|
void set_indexes(int logical_index, int physical_index) {
|
|
m_index = logical_index;
|
|
m_physical_index = physical_index;
|
|
}
|
|
|
|
// start process on device
|
|
void init_process();
|
|
|
|
void load_libraries(void);
|
|
void init_ptr_data(void);
|
|
|
|
// performs library intialization on the device side
|
|
pid_t init_device(void);
|
|
|
|
private:
|
|
// get pipeline associated with a calling thread
|
|
COIPIPELINE get_pipeline(void);
|
|
|
|
// get automatic vars set associated with the calling thread
|
|
AutoSet& get_auto_vars(void);
|
|
|
|
// destructor for thread data
|
|
static void destroy_thread_data(void *data);
|
|
|
|
private:
|
|
typedef std::set<PtrData> PtrSet;
|
|
typedef std::map<const void*, OffloadDescriptor*> SignalMap;
|
|
|
|
// device indexes
|
|
int m_index;
|
|
int m_physical_index;
|
|
|
|
// number of COI pipes created for the engine
|
|
long m_proc_number;
|
|
|
|
// process handle
|
|
COIPROCESS m_process;
|
|
|
|
// If false, device either has not been initialized or new libraries
|
|
// have been added.
|
|
bool m_ready;
|
|
mutex_t m_lock;
|
|
|
|
// List of libraries to be loaded
|
|
TargetImageList m_images;
|
|
|
|
// var tables
|
|
PtrDataTable m_ptr_set;
|
|
PtrDataTable m_targetptr_set;
|
|
|
|
// signals
|
|
SignalMap m_signal_map;
|
|
mutex_t m_signal_lock;
|
|
|
|
// streams
|
|
StreamMap m_stream_map;
|
|
mutex_t m_stream_lock;
|
|
int m_num_cores;
|
|
int m_num_threads;
|
|
std::bitset<COI_MAX_HW_THREADS> m_cpus;
|
|
|
|
// List of dynamic libraries to be registred
|
|
DynLibList m_dyn_libs;
|
|
|
|
// constants for accessing device function handles
|
|
enum {
|
|
c_func_compute = 0,
|
|
#ifdef MYO_SUPPORT
|
|
c_func_myo_init,
|
|
c_func_myo_fini,
|
|
#endif // MYO_SUPPORT
|
|
c_func_init,
|
|
c_func_var_table_size,
|
|
c_func_var_table_copy,
|
|
c_func_set_stream_affinity,
|
|
c_funcs_total
|
|
};
|
|
static const char* m_func_names[c_funcs_total];
|
|
|
|
// device function handles
|
|
COIFUNCTION m_funcs[c_funcs_total];
|
|
|
|
// int -> name mapping for device signals
|
|
static const int c_signal_max = 32;
|
|
static const char* c_signal_names[c_signal_max];
|
|
};
|
|
|
|
#endif // OFFLOAD_ENGINE_H_INCLUDED
|