| max_mac_ll.cpp.html | mathcode2html |
| Source file: max_mac_ll.cpp | |
| Converted: Fri Feb 3 2017 at 15:42:51 | |
| This documentation file will not reflect any later changes in the source file. |
$$\phantom{******** If you see this on the webpage then the
browser could not locate *********}$$
$$\phantom{******** jsMath/easy/load.js or the variable root
is set wrong in this file *********}$$
$$\newcommand{\vector}[1]{\left[\begin{array}{c} #1 \end{array}\right]}$$
$$\newenvironment{matrix}{\left[\begin{array}{cccccccccc}} {\end{array}\right]}$$
$$\newcommand{\A}{{\cal A}}$$
$$\newcommand{\W}{{\cal W}}$$
/* Copyright 2008-2016 Research Foundation State University of New York */ /* This file is part of QUB Express. */ /* QUB Express is free software; you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation, either version 3 of the License, or */ /* (at your option) any later version. */ /* QUB Express is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* You should have received a copy of the GNU General Public License, */ /* named LICENSE.txt, in the QUB Express program directory. If not, see */ /* <http://www.gnu.org/licenses/>. */ /*
|
|
See also: Up: Index |
*/
#ifdef _WIN32
#include <windows.h>
#else
#include <stdlib.h>
#define BOOL int
#define TRUE 1
#define FALSE 0
#endif
#include <string.h>
#include <iostream>
#include <fstream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <string.h>
#include <set>
#include <map>
#include <vector>
#include <algorithm>
#ifdef QUBX_MAC_OPENCL
#ifdef __APPLE__ // mac os x:
#include <OpenCL/opencl.h>
#else // linux/nvidia:
#include <CL/cl.h>
#endif
#endif
#include "max_mac_ll.h"
#include "../qubfast/max_ll_util.h"
#include "../qubfast/qubx_model_storage.h"
#include "../qubfast/ublas_matrixutil.h"
#include "../qubfast/callbk_reportstream.h"
#include "../qubfast/qub_constraints.h"
using std::cout;
using std::cerr;
using std::endl;
using std::set;
using std::map;
using std::pair;
using std::copy;
using std::min;
using std::max;
class qubx_mac_data;
class qubx_mac_param_storage;
#ifdef QUBX_MAC_OPENCL
#include "max_mac_ll.opencl"
//#include "max_mac_ll_64.opencl"
#define SMALLEST_CHUNK 8
#define OVERLOADING 16
#define QUBX_OCL_MAX_NSTATE 16
#define QUBX_OCL_MIN_NDATA 256
#define QUBX_4 0
#define QUBX_8 1
#define QUBX_16 2
int qubx_mac_opencl_setup_context(qubx_mac_data *data);
void qubx_mac_opencl_cleanup_context(qubx_mac_data *data);
int qubx_mac_opencl_setup_data(qubx_mac_data *data);
void qubx_mac_opencl_cleanup_data(qubx_mac_data *data);
int qubx_mac_opencl_setup_model(qubx_mac_param *p, qubx_mac_data *data);
void qubx_mac_opencl_cleanup_model(qubx_mac_param_storage *st);
int qubx_mac_opencl_setup_data_V(qubx_mac_data *data, qubx_mac_param *p);
int qubx_mac_opencl_setup_P0(qubx_mac_data *data, qubx_mac_param *p);
int qubx_mac_ll_opencl(qubx_mac_param *p);
#endif
int qubx_mac_ll_basic(qubx_mac_param *p);
int qubx_mac_ll_openmp(qubx_mac_param *p);
#ifdef _WIN32
#include "../qubfast/gettimeofday.h"
#else
#include <sys/time.h>
#endif
void print_timestamp(MAXMLL_VAR_NOT_USED
const char *lbl)
{
//struct timeval tm;
//gettimeofday(&tm, NULL);
//cerr << "--- " << tm.tv_sec << " : " << (tm.tv_usec / 1000) << "." << (tm.tv_usec % 1000) << " --- " << lbl << endl;
}
typedef map<std::vector<double>, int> stimmap;
void mplex_idl(int Nsignal, int *dwellCounts, int **classeses, float **durationses,
double **ampses, int *out_dwellCount, int *out_classes, float *out_durations,
stimmap& stimcls_of)
{
if ( ! Nsignal )
return;
// the key to stimmap
std::vector<double> amps(1+Nsignal);
// output
int &Nd = *out_dwellCount;
Nd = 0;
// per-signal stuff
std::vector<int> finger(Nsignal, 0);
std::vector<int> clss(Nsignal);
std::vector<float> remain(Nsignal);
for (int i=0; i<Nsignal; ++i) {
if ( ! dwellCounts[i] )
return;
clss[i] = classeses[i][0];
remain[i] = durationses[i][0];
if ( clss[i] >= 0 )
amps[i+1] = ampses[i][ clss[i] ];
else
amps[i+1] = 0.0; // gap at beginning of stimulus: should complain and fail; instead silently defaulting to zero
}
while ( 1 ) {
// until any signal ends
bool done = false;
for (int i=0; i<Nsignal; ++i)
if ( finger[i] >= dwellCounts[i] )
done = true;
if ( done )
break;
// find signal with shortest remaining
float tm = remain[0];
//int changing = 0;
for (int i=0; i<Nsignal; ++i) {
if ( remain[i] < tm ) {
tm = remain[i];
//changing = i;
}
}
// look up stimclass
stimmap::iterator sti = stimcls_of.find(amps);
if ( sti == stimcls_of.end() ) {
int sc = (int) stimcls_of.size();
stimcls_of[amps] = sc;
}
// append the event
out_classes[Nd] = stimcls_of[amps];
out_durations[Nd] = tm;
++Nd;
// shorten all signals remain, and move the finger for any that are 0
for (int i=0; i<Nsignal; ++i) {
remain[i] -= tm;
if ( remain[i] <= 0.0 ) {
if ( (++finger[i]) < dwellCounts[i] ) {
remain[i] = durationses[i][ finger[i] ];
clss[i] = classeses[i][ finger[i] ];
if ( clss[i] >= 0 )
amps[i+1] = ampses[i][ clss[i] ];
// else: gap: keep the last event's amp
}
}
}
}
}
class qubx_mac_data_piece
{
public:
int stimclass;
qubx_mac_sample *data; // pointer into mac_data->data, or NULL if count == 0
qubx_mac_sample_out *data_out; // pointer into mac_data->data_out, or NULL if count == 0
int count;
double dur;
qubx_mac_data_piece(int stimclass_, qubx_mac_sample *data_, qubx_mac_sample_out *data_out_, int count_, double dur_)
: stimclass(stimclass_), data(data_), data_out(data_out_), count(count_), dur(dur_)
{}
};
class qubx_mac_data_seg
{
public:
std::vector<qubx_mac_data_piece> pieces;
double dur;
qubx_mac_data_seg()
: dur(0.0)
{}
};
class qubx_mac_data
{
public:
int Nstim, Nsig, Ndata, data_cap, maxNpiece;
qubx_mac_sample *data;
qubx_mac_sample_out *data_out;
std::vector<qubx_mac_data_seg> segments;
std::vector<double> stimclasses;
stimmap stimcls_of;
std::vector<double> sc_frac;
#ifdef QUBX_MAC_OPENCL
cl_context ocl_context;
cl_device_id ocl_device;
cl_uint ocl_compunits;
cl_command_queue ocl_queue;
cl_program ocl_program, ocl64_program;
cl_kernel ocl_P0_kernel[3], ocl_point_kernel[3], ocl64_P0_kernel[3], ocl64_point_kernel[3];
cl_mem ocl_seg_npiece, ocl_seg_piece_dur, ocl_seg_piece_sc;
cl_mem ocl_samples, ocl_samples_out, ocl_fit_var_ll, ocl_workslices;
cl_mem ocl_seg_piece_P0, ocl_seg_piece_dV, ocl_seg_piece_Ileak;
int ocl_slicecount;
float *transfer;
int sent_v;
cl_mem ocl_dbg;
#endif
qubx_mac_data(int Nstimsig_, int data_cap_)
: Nstim(0), Nsig(Nstimsig_+1), Ndata(0), data_cap(data_cap_), maxNpiece(0)
, data(new qubx_mac_sample[data_cap_]), data_out(new qubx_mac_sample_out[data_cap_])
#ifdef QUBX_MAC_OPENCL
, ocl_context(NULL), ocl_device(0), ocl_compunits(0), ocl_queue(NULL), ocl_program(NULL), ocl64_program(NULL)
//, ocl_P0_kernel({NULL, NULL, NULL}), ocl_point_kernel({NULL, NULL, NULL})
//, ocl64_P0_kernel({NULL, NULL, NULL}), ocl64_point_kernel({NULL, NULL, NULL})
, ocl_seg_npiece(NULL), ocl_seg_piece_dur(NULL), ocl_seg_piece_sc(NULL)
, ocl_samples(NULL), ocl_samples_out(NULL), ocl_fit_var_ll(NULL), ocl_workslices(NULL)
, ocl_seg_piece_P0(NULL), ocl_seg_piece_dV(NULL), ocl_seg_piece_Ileak(NULL)
, ocl_slicecount(0), transfer(NULL), sent_v(0), ocl_dbg(NULL)
{
for (int i=0; i<3; ++i)
ocl_P0_kernel[i] = ocl_point_kernel[i] = ocl64_P0_kernel[i] = ocl64_point_kernel[i] = NULL;
}
#else
{}
#endif
virtual ~qubx_mac_data()
{
delete [] data;
delete [] data_out;
#ifdef QUBX_MAC_OPENCL
qubx_mac_opencl_cleanup_data(this);
qubx_mac_opencl_cleanup_context(this);
#endif
}
void reset(int Nstimsig, int newcap)
{
Nstim = Ndata = maxNpiece = 0;
Nsig = Nstimsig + 1;
segments.clear();
stimclasses.clear();
stimcls_of.clear();
ensure_capacity(newcap);
#ifdef QUBX_MAC_OPENCL
qubx_mac_opencl_cleanup_data(this);
#endif
}
void ensure_capacity(int newcap)
{
if ( data_cap < newcap ) {
while ( data_cap < newcap )
data_cap *= 2;
qubx_mac_sample *newdata = new qubx_mac_sample[data_cap];
qubx_mac_sample_out *newdata_out = new qubx_mac_sample_out[data_cap];
memcpy(newdata, data, Ndata*sizeof(qubx_mac_sample));
memcpy(newdata_out, data_out, Ndata*sizeof(qubx_mac_sample_out));
for (std::vector<qubx_mac_data_seg>::iterator segi=segments.begin(); segi!=segments.end(); ++segi) {
for (std::vector<qubx_mac_data_piece>::iterator piece = segi->pieces.begin(); piece != segi->pieces.end(); ++piece) {
if ( piece->data ) {
piece->data = newdata + (piece->data - data);
piece->data_out = newdata_out + (piece->data_out - data_out);
}
}
}
delete [] data;
delete [] data_out;
data = newdata;
data_out = newdata_out;
}
}
int append_segment()
{
int ix = (int) segments.size();
segments.resize(ix+1);
return ix;
}
void append_chunk(int seg_ix, double dur_sec, int Nd, float *T, float *I, float *V,
int *dwellCounts, int **classeses, float **durationses, double **ampses)
{
ensure_capacity(Ndata + Nd);
qubx_mac_sample *di = data + Ndata;
for (int i=0; i<Nd; ++i, ++di) {
di->time = T[i];
di->cur = I[i];
di->var = V ? V[i] : 0.0f;
}
qubx_mac_sample *chunk_data = Nd ? (data + Ndata) : NULL;
qubx_mac_sample_out *chunk_data_out = Nd ? (data_out + Ndata) : NULL;
qubx_mac_sample_out *dio = chunk_data_out;
Ndata += Nd;
if ( Nsig < 2 ) {
segments[seg_ix].pieces.push_back(qubx_mac_data_piece(0, chunk_data, chunk_data_out, Nd, dur_sec));
maxNpiece = max(maxNpiece, (int) segments[seg_ix].pieces.size());
return;
}
int max_mplex = 0;
for (int i=0; i<(Nsig-1); ++i)
max_mplex += dwellCounts[i];
std::vector<int> mplex_clss(max_mplex);
std::vector<float> mplex_durs(max_mplex);
int mplex_count = 0;
mplex_idl(Nsig-1, dwellCounts, classeses, durationses, ampses, &mplex_count, & mplex_clss[0], & mplex_durs[0], stimcls_of);
// adjust sample times to be relative to last stimulus transition:
di = chunk_data;
qubx_mac_sample *di_end = data + Ndata;
double& seg_dur = segments[seg_ix].dur;
for (int i=0; i<mplex_count; ++i) {
float prev_dur = (float) seg_dur;
seg_dur += 1e-3 * mplex_durs[i];
qubx_mac_sample *piece_data = di;
qubx_mac_sample_out *piece_out = dio;
while ( di && (di != di_end) && (di->time < seg_dur) ) {
di->time -= prev_dur;
++di;
++dio;
}
segments[seg_ix].pieces.push_back(qubx_mac_data_piece(mplex_clss[i],
(di != piece_data) ? piece_data : NULL,
(dio != piece_out) ? piece_out : NULL,
(int) (di - piece_data),
1e-3 * mplex_durs[i]));
}
maxNpiece = max(maxNpiece, (int) segments[seg_ix].pieces.size());
}
double* get_stimclasses()
{
Nstim = (int) stimcls_of.size();
if ( ! Nstim )
Nstim = 1;
stimclasses.resize(Nstim*Nsig);
for (stimmap::iterator stimi = stimcls_of.begin(); stimi != stimcls_of.end(); ++stimi) {
int i = stimi->second * Nsig;
copy(stimi->first.begin(), stimi->first.end(), & stimclasses[i]);
}
sc_frac.resize(Nstim);
memset(& sc_frac[0], 0, Nstim*sizeof(double));
int total = 0;
for (std::vector<qubx_mac_data_seg>::iterator segi=segments.begin(); segi!=segments.end(); ++segi) {
for (std::vector<qubx_mac_data_piece>::iterator piece = segi->pieces.begin(); piece != segi->pieces.end(); ++piece) {
if ( piece->data ) {
total += piece->count;
sc_frac[piece->stimclass] += piece->count;
}
}
}
if ( total )
for (int i=0; i<Nstim; ++i)
sc_frac[i] /= total;
return & stimclasses[0];
}
};
extern "C" MAXMLL_API void* __stdcall qubx_mac_data_create(int Nstimsig, int data_cap)
{
return new qubx_mac_data(Nstimsig, data_cap);
}
extern "C" MAXMLL_API void __stdcall qubx_mac_data_free(void *data)
{
delete (qubx_mac_data *) data;
}
MAXMLL_API void __stdcall qubx_mac_data_reset(void *data, int Nstimsig, int data_cap)
{
((qubx_mac_data *) data)->reset(Nstimsig, data_cap);
}
extern "C" MAXMLL_API int __stdcall qubx_mac_data_append_segment(void *data)
{
return ((qubx_mac_data *) data)->append_segment();
}
extern "C" MAXMLL_API void __stdcall qubx_mac_data_append_chunk(void *data, int seg_ix, double dur_sec, int Nd,
float *T, float *I, float *V, int *dwellCounts,
int **classeses, float **durationses, double **ampses)
{
((qubx_mac_data *) data)->append_chunk(seg_ix, dur_sec, Nd, T, I, V, dwellCounts, classeses, durationses, ampses);
}
extern "C" MAXMLL_API double* __stdcall qubx_mac_data_get_stimclasses(void *data, int *Nstimclass)
// returns double[Nstimclass * Nsignal], Nsignal = Nstimsig + 1;
{
qubx_mac_data *macdata = (qubx_mac_data *) data;
double *rtn = macdata->get_stimclasses();
*Nstimclass = macdata->Nstim;
return rtn;
}
extern "C" MAXMLL_API double* __stdcall qubx_mac_data_get_stimclass_frac(void *data)
{
qubx_mac_data *macdata = (qubx_mac_data *) data;
return & macdata->sc_frac[0];
}
extern "C" MAXMLL_API qubx_mac_sample_out* __stdcall qubx_mac_data_get_buffer(void *data)
{
qubx_mac_data *macdata = (qubx_mac_data *) data;
return macdata->data_out;
}
void copy_sub_hess(double *sub, int N, double *full, int x0, int y0, int xn, int yn)
{
for (int i=0; i<xn; ++i) {
for (int j=0; j<yn; ++j) {
*(sub++) = full[(x0+i)*N + y0+j];
}
}
}
class qubx_mac_param_model {
public:
int Npar, NfPar;
qubx_model *model;
qubx_stim_amps *ampm;
qubx_stim_rates *ratesm;
pointy_col_vector<double> xamp, xvar;
double scale_Nchannel;
double float_Nchannel;
pointy_col_vector<double> pars, fPars;
void init(qubx_model *model_, qubx_stim_amps *ampm_, qubx_stim_rates *ratesm_)
{
model = model_;
ampm = ampm_;
ratesm = ratesm_;
xamp.resize(model_->Nstate);
xvar.resize(model_->Nstate);
scale_Nchannel = 1.0;
float_Nchannel = model->Nchannel;
scale_Nchannel = log(float_Nchannel);
}
void setup_xampvar()
{
if ( model->IeqFv ) {
for (int s=model->Nstate-1; s>=0; --s) {
xvar.p[s] = model->var[model->clazz[s]];
xamp.p[s] = model->amp[model->clazz[s]];
}
} else {
for (int s=model->Nstate-1; s>=0; --s) {
xvar.p[s] = max(0.0, model->var[model->clazz[s]] - model->var[0]);
xamp.p[s] = model->amp[model->clazz[s]] - model->amp[0];
}
}
}
void setup_constraints(int optNchannel)
{
if ( optNchannel )
optNchannel = 1;
qubx_constrained_setup(ampm->cns);
qubx_stim_rates_setup_constraints(ratesm);
Npar = ampm->cns->Npar + ratesm->cns->Npar + optNchannel;
NfPar = ampm->cns->NfPar + ratesm->cns->NfPar + optNchannel;
pars.resize(Npar);
memcpy(pars.p, ratesm->cns->pars, ratesm->cns->Npar*sizeof(double));
fPars.resize(NfPar);
memcpy(fPars.p, ratesm->cns->fPars, ratesm->cns->NfPar*sizeof(double));
memcpy(pars.p+ratesm->cns->Npar, ampm->cns->pars, ampm->cns->Npar*sizeof(double));
memcpy(fPars.p+ratesm->cns->NfPar, ampm->cns->fPars, ampm->cns->NfPar*sizeof(double));
if ( optNchannel ) {
pars.p[Npar-1] = fPars.p[NfPar-1] = 1.0;
}
float_Nchannel = model->Nchannel;
scale_Nchannel = log(float_Nchannel);
}
void setup_nchannel() {
float_Nchannel = model->Nchannel;
scale_Nchannel = log(float_Nchannel);
}
void do_fPars_to_model(int optNchannel) {
if ( ratesm->cns->NfPar ) {
memcpy(ratesm->cns->fPars, fPars.p, ratesm->cns->NfPar*sizeof(double));
qubx_constrained_free_to_pars(ratesm->cns);
qubx_stim_rates_pars_to_model(ratesm);
}
if ( ampm->cns->NfPar ) {
memcpy(ampm->cns->fPars, fPars.p+ratesm->cns->NfPar, ampm->cns->NfPar*sizeof(double));
qubx_constrained_free_to_pars(ampm->cns);
qubx_stim_amps_pars_to_model(ampm);
}
if ( optNchannel ) {
float_Nchannel = max(1.0, exp(scale_Nchannel * fPars.p[NfPar-1]));
model->Nchannel = (int) (float_Nchannel + 0.5);
}
}
void calc_std(double *InvHessian, double *pars_all, int NfPar_all, int iPar, int ifPar, int optNchannel)
{
std::vector<double> subhess(NfPar * NfPar);
if ( ratesm->cns->NfPar ) {
copy_sub_hess(&subhess[0], NfPar_all, InvHessian, ifPar, ifPar, ratesm->cns->NfPar, ratesm->cns->NfPar);
qubx_stim_rates_calc_std(ratesm, &subhess[0]);
}
if ( ampm->cns->NfPar ) {
copy_sub_hess(&subhess[0], NfPar_all, InvHessian, ifPar+ratesm->cns->NfPar, ifPar+ratesm->cns->NfPar, ampm->cns->NfPar, ampm->cns->NfPar);
qubx_stim_amps_calc_std(ampm, &subhess[0]);
}
if ( optNchannel ) {
model->d_Nchannel = sqrt(scale_Nchannel * scale_Nchannel * InvHessian[(NfPar_all+1)*(ifPar+NfPar-1)]) * exp(pars_all[iPar+Npar-1]);
}
}
};
class qubx_mac_param_storage {
public:
std::vector<qubx_mac_param_model> models;
int &Npar, &NfPar; // same as param->Npar, param->NfPar
pointy_col_vector<double> pars, fPars;
int iModel;
#ifdef QUBX_MAC_OPENCL
float *transfer;
cl_mem ocl_QQ_eig, ocl_QQ_evec, ocl_QQ_einv, ocl_xamp, ocl_xvar;
float ocl_amp0, ocl_var0;
int ocl_dim;
#endif
qubx_mac_param_storage(int Nmodel, int& Npar_, int& NfPar_)
: models(Nmodel), Npar(Npar_), NfPar(NfPar_), iModel(0)
#ifdef QUBX_MAC_OPENCL
, transfer(NULL), ocl_QQ_eig(NULL), ocl_QQ_evec(NULL), ocl_QQ_einv(NULL), ocl_xamp(NULL), ocl_xvar(NULL)
#endif
{}
#ifdef QUBX_MAC_OPENCL
virtual ~qubx_mac_param_storage()
{
qubx_mac_opencl_cleanup_model(this);
}
#endif
void add_model(qubx_model *model, qubx_stim_amps *ampm, qubx_stim_rates *ratesm)
{
models[iModel++].init(model, ampm, ratesm);
#ifdef QUBX_MAC_OPENCL
if ( models.size() == 1 ) {
if ( model->Nstate <= 4 )
ocl_dim = 4;
else if ( model->Nstate <= 8 )
ocl_dim = 8;
else
ocl_dim = 16;
}
#endif
}
void setup_xampvar()
{
for (int i=(int)models.size()-1; i>=0; --i)
models[i].setup_xampvar();
}
int setup_constraints(qubx_mac_param *p)
{
setup_xampvar();
Npar = NfPar = 0;
for (int i=(int)models.size()-1; i>=0; --i) {
models[i].setup_constraints(p->optNchannel);
Npar += models[i].Npar;
NfPar += models[i].NfPar;
}
pars.resize(Npar);
fPars.resize(NfPar);
p->pars = pars.p;
p->fPars = fPars.p;
int iPar = 0;
int ifPar = 0;
for (size_t i=0; i<models.size(); ++i) {
memcpy(pars.p+iPar, models[i].pars.p, models[i].Npar*sizeof(double));
memcpy(fPars.p+ifPar, models[i].fPars.p, models[i].NfPar*sizeof(double));
iPar += models[i].Npar;
ifPar += models[i].NfPar;
}
return 0;
}
void setup_nchannel() {
for (size_t i=0; i<models.size(); ++i) {
models[i].setup_nchannel();
}
}
void do_fPars_to_model(int optNchannel)
{
int ifPar = 0;
for (size_t i=0; i<models.size(); ++i) {
memcpy(models[i].fPars.p, fPars.p+ifPar, models[i].NfPar*sizeof(double));
models[i].do_fPars_to_model(optNchannel);
ifPar += models[i].NfPar;
}
setup_xampvar();
}
void calc_std(qubx_mac_param *p, double *InvHessian)
{
int iPar = 0;
int ifPar = 0;
for (size_t i=0; i<models.size(); ++i) {
models[i].calc_std(InvHessian, pars.p, p->NfPar, iPar, ifPar, p->optNchannel);
iPar += models[i].Npar;
ifPar += models[i].NfPar;
}
}
};
extern "C" MAXMLL_API qubx_mac_param* __stdcall qubx_mac_param_create(int Nmodel)
{
qubx_mac_param *p = new qubx_mac_param;
qubx_mac_param_storage *st = new qubx_mac_param_storage(Nmodel, p->Npar, p->NfPar);
p->storage = st;
p->useVar = 1;
p->optNchannel = 1;
p->accel = 10;
p->Npar = p->NfPar = 0;
p->ll = 0.0;
p->gpu_fail = 0;
p->float_Nchannel = 0.0;
return p;
}
extern "C" MAXMLL_API void __stdcall qubx_mac_param_free(qubx_mac_param *p)
{
delete (qubx_mac_param_storage*) p->storage;
delete p;
}
extern "C" MAXMLL_API void __stdcall qubx_mac_param_add_model(qubx_mac_param *p, qubx_model *model, qubx_stim_amps *ampm, qubx_stim_rates *ratesm)
{
qubx_mac_param_storage *st = (qubx_mac_param_storage*) p->storage;
st->add_model(model, ampm, ratesm);
}
extern "C" MAXMLL_API int __stdcall qubx_mac_setup_constraints(qubx_mac_param *p)
{
qubx_mac_param_storage *st = (qubx_mac_param_storage*) p->storage;
return st->setup_constraints(p);
}
extern "C" MAXMLL_API void __stdcall qubx_mac_do_fPars_to_model(qubx_mac_param *p)
{
qubx_mac_param_storage *ps = (qubx_mac_param_storage*) p->storage;
ps->do_fPars_to_model(p->optNchannel);
p->float_Nchannel = 0.0; // meaning: use the individual models' float_Nchannel unless it gets changed
}
extern "C" MAXMLL_API void __stdcall qubx_mac_calc_std(qubx_mac_param *p, double *InvHessian)
{
qubx_mac_param_storage *ps = (qubx_mac_param_storage *) p->storage;
ps->calc_std(p, InvHessian);
}
class qubx_mac_workspace {
public:
qubx_mac_param *p;
qubx_mac_param_model *pm;
qubx_mac_param_storage *st;
qubx_model *m;
qubx_stim_rates_storage *rst;
matrix<double> A;
matrix<double> P0, Pt;
matrix<double> varPt;
matrix<double> tmp_m, tmp_v, tmp_s;
matrix<double> tmp_diag;
int average_count; // how many traces were averaged together (unimplemented == 1)
void init(qubx_mac_param *param, qubx_mac_param_model *pmodel)
{
p = param;
pm = pmodel;
st = (qubx_mac_param_storage *) param->storage;
m = pm->model;
rst = (qubx_stim_rates_storage *) pm->ratesm->storage;
A.resize(m->Nstate, m->Nstate);
P0.resize(1, m->Nstate);
Pt.resize(1, m->Nstate);
varPt.resize(m->Nstate, m->Nstate);
tmp_m.resize(m->Nstate, m->Nstate);
tmp_v.resize(1, m->Nstate);
tmp_s.resize(1, 1);
tmp_diag.resize(m->Nstate, m->Nstate);
average_count = 1;
tmp_diag.clear();
if ( p->float_Nchannel > 1e-3 )
pm->float_Nchannel = p->float_Nchannel; // override from multi-datafile fit
}
matrix<double>& do_A(int sc, double t)
{
rst->setup_A(sc, t, tmp_diag, tmp_m, A);
//cerr << "Q:" << endl << ((qubx_stim_rates_storage *) p->ratesm->storage)->QQ[sc].m << endl;
//cerr << "A:" << endl << A << endl;
return A;
}
void do_Pt(int sc, double t)
{
noalias(Pt) = prod(P0, do_A(sc, t));
}
matrix<double>& do_P0(int sc)
{
if ( m->usePeq ) {
QtoPe(rst->QQ[sc].m, P0);
} else {
memcpy(& P0(0,0), m->P0, m->Nstate * sizeof(double));
}
return P0;
}
void advance_P0(int sc, double t)
{
P0 = prod(P0, do_A(sc, t));
}
double do_mean_t(double mV)
{
noalias(tmp_s) = prod(Pt, pm->xamp.m);
double mt = pm->float_Nchannel * tmp_s(0,0);
if ( m->IeqFv ) {
mt = ((mV - 1e-3*m->Vrev)*mt + (mV - 1e-3*m->Vleak)*m->Gleak);
}
return ( m->amp0 + mt );
}
void do_var_Pt() // state variance
{
for (int i=0; i<m->Nstate; ++i)
for (int j=0; j<m->Nstate; ++j)
varPt(i,j) = (i == j)
? (Pt(0,i) * (1.0 - Pt(0,i)))
: (- Pt(0,i) * Pt(0,j));
}
double do_var_t(double dV) // total variance
{
noalias(tmp_v) = prod(trans(pm->xamp.m), varPt);
noalias(tmp_s) = prod(tmp_v, pm->xamp.m);
double v = tmp_s(0,0);
noalias(tmp_s) = prod(Pt, pm->xvar.m);
v += tmp_s(0,0);
if ( m->IeqFv )
v *= dV * dV;
v = (m->var0 + pm->float_Nchannel*v) / average_count; ////
if ( v <= 0.0 )
v = 1e-30;
return ( v );
}
};
extern "C" MAXMLL_API int __stdcall qubx_mac_ll(qubx_mac_param *p)
{
int rtn;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
qubx_model *model = st->models[0].model;
if ( p->accel > 1 )
#ifdef QUBX_MAC_OPENCL
if ( (model->Nstate <= QUBX_OCL_MAX_NSTATE)
&& (((qubx_mac_data*) p->data)->Ndata >= QUBX_OCL_MIN_NDATA)
&& (st->models.size() == 1) ) {
if ( ! (rtn = qubx_mac_ll_opencl(p)) )
return 0;
else
p->gpu_fail = 1;
}
#else
p->gpu_fail = 1;
#endif
if ( p->accel > 0 )
if ( ! (rtn = qubx_mac_ll_openmp(p)) )
return 0;
return qubx_mac_ll_basic(p);
}
int qubx_mac_ll_basic(qubx_mac_param *p)
{
print_timestamp("basic");
qubx_mac_data *macdata = (qubx_mac_data *) p->data;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
int& stop_flag = st->models[0].model->stop_flag;
p->ll = 0.0;
double logl = 0.0;
double log2pi = log(2.0*M_PI);
int Nmodel = (int) st->models.size();
{
std::vector<qubx_mac_workspace> ws(Nmodel); // per omp thread
for (int m=0; m<Nmodel; ++m)
ws[m].init(p, & st->models[m]);
for (size_t s=0; s < macdata->segments.size(); ++s) {
for (int m=Nmodel-1; m>=0; --m)
ws[m].do_P0(macdata->segments[s].pieces[0].stimclass);
double seg_ll = 0.0;
int total_points = 0;
if ( (! stop_flag) && macdata->segments[s].pieces.size() ) {
double dV = 0.0;
double mV = 1.0;
double piece_time = 0.0;
std::vector<qubx_mac_data_piece>& pieces = macdata->segments[s].pieces;
int npiece = (int) pieces.size();
for (int ip=0; ip<npiece; ++ip) {
qubx_mac_data_piece *piece = &(pieces[ip]);
if ( ! stop_flag ) {
if ( ws[0].pm->model->IeqFv ) { // update dV
mV = 1e-3 * ws[0].pm->ampm->stimclasses[piece->stimclass * ws[0].pm->ampm->Nsig + ws[0].pm->model->iVoltage];
dV = mV - 1e-3 * ws[0].pm->model->Vrev;
}
for (int isam = 0; isam < piece->count; ++isam) {
qubx_mac_sample *sample = piece->data + isam;
qubx_mac_sample_out *sample_out = piece->data_out + isam;
double mean = 0.0;
double var = 0.0;
for (int m=Nmodel-1; m>=0; --m) {
ws[m].do_Pt(piece->stimclass, sample->time);
mean += ws[m].do_mean_t(mV);
if ( p->useVar ) {
ws[m].do_var_Pt();
var += ws[m].do_var_t(dV);
}
}
sample_out->fit = (float) mean;
double ss = sample->cur - mean;
ss *= ss;
if ( p->useVar ) {
//if ( (p == 0) && (isam == 1) )
//cerr << "varPt:" << endl << ws.varPt << endl;
sample_out->fit_var = (float) var;
if ( var )
ss = log(var) + ss/var;
sample_out->ll = (float) (-0.5 * (ss + log2pi));
} else {
sample_out->ll = (float) (-ss);
}
seg_ll += ss;
++total_points;
}
piece_time += piece->dur;
}
for (int m=Nmodel-1; m>=0; --m)
ws[m].advance_P0(piece->stimclass, piece->dur);
}
}
logl -= seg_ll;
if ( p->useVar ) {
logl -= total_points * log2pi;
}
}
}
if ( p->useVar ) {
logl *= 0.5;
}
p->ll = logl;
if ( stop_flag ) {
return -8; // stopped
}
print_timestamp("end basic");
return 0; // success
}
int qubx_mac_ll_openmp(qubx_mac_param *p)
{
print_timestamp("start openmp");
qubx_mac_data *macdata = (qubx_mac_data *) p->data;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
p->ll = 0.0;
double logl = 0.0;
double log2pi = log(2.0*M_PI);
int Nmodel = (int) st->models.size();
int& stop_flag = st->models[0].model->stop_flag;
int stop_read;
#pragma omp parallel reduction (+: logl) shared(stop_read)
{
std::vector<qubx_mac_workspace> ws(Nmodel); // per omp thread
for (int m=0; m<Nmodel; ++m)
ws[m].init(p, & st->models[m]);
for (size_t s=0; s < macdata->segments.size(); ++s) {
for (int m=0; m<Nmodel; ++m)
ws[m].do_P0(macdata->segments[s].pieces[0].stimclass);
double seg_ll = 0.0;
int total_points = 0;
if ( macdata->segments[s].pieces.size() ) {
double dV = 0.0;
double mV = 1.0;
double piece_time = 0.0;
std::vector<qubx_mac_data_piece>& pieces = macdata->segments[s].pieces;
int npiece = (int) pieces.size();
for (int ip=0; ip<npiece; ++ip) {
qubx_mac_data_piece *piece = &(pieces[ip]);
#pragma omp single
{
stop_read = stop_flag;
}
if ( ! stop_read ) {
if ( ws[0].pm->model->IeqFv ) { // update dV
mV = 1e-3 * ws[0].pm->ampm->stimclasses[piece->stimclass * ws[0].pm->ampm->Nsig + ws[0].pm->model->iVoltage];
dV = mV - 1e-3 * ws[0].pm->model->Vrev;
}
#pragma omp for
for (int isam = 0; isam < piece->count; ++isam) {
qubx_mac_sample *sample = piece->data + isam;
qubx_mac_sample_out *sample_out = piece->data_out + isam;
double mean = 0.0;
double var = 0.0;
for (int m=Nmodel-1; m>=0; --m) {
ws[m].do_Pt(piece->stimclass, sample->time);
mean += ws[m].do_mean_t(mV);
if ( p->useVar ) {
ws[m].do_var_Pt();
var += ws[m].do_var_t(dV);
}
}
sample_out->fit = (float) mean;
double ss = sample->cur - mean;
ss *= ss;
if ( p->useVar ) {
sample_out->fit_var = (float) var;
if ( var )
ss = log(var) + ss/var;
sample_out->ll = (float) (-0.5 * (ss + log2pi));
} else {
sample_out->ll = (float) (-ss);
}
seg_ll += ss;
++total_points;
}
piece_time += piece->dur;
}
for (int m=0; m<Nmodel; ++m) {
ws[m].advance_P0(piece->stimclass, piece->dur);
}
}
}
logl -= seg_ll;
if ( p->useVar ) {
logl -= total_points * log2pi;
}
}
}
if ( p->useVar ) {
logl *= 0.5;
}
p->ll = logl;
if ( stop_flag ) {
return -8; // stopped
}
print_timestamp("end openmp");
return 0; // success
}
/*
as markup on rates.data to remain on card:
mac_opencl_data_create(data, model):
global: data[Ndata] maybe re-use signal variance as point_ll?
global: seg_npiece[Nseg]
global: seg_piece_dur[Nseg*maxNpiece]
global: seg_piece_sc[Nseg*maxNpiece]
// pad to 8 or 16 states? if i>= or j>= return immediately?
per run:
mac_opencl_create(data, model):
global: QQ_eig, QQ_eigenvec, QQ_eiginv [Ns*Ns*Nsc]
global: xamp, xvar, amp0, var0, Nchannel
global: seg_piece_P0[Nseg*maxNpiece*Ns]
global: seg_piece_dV, seg_piece_Ileak
fill each seg_piece_P0[seg][0]
mac_p0_kernel<1>(Nseg)
[i] shared: P0 of seg start
each piece[:-1]:
[ij] shared: Q_eigenvec[sc], Q_eiginv[sc]
[i] shared: eigx = Q_eig[sc] * x
----
[ij]k shared: Pt_i = P0_j * vec_jk * eigx_k * inv_ki
----
[i] {store Pt_i in seg_piece_P0[seg][piece+1]
{P0_i = Pt_i
----
nwork(chunksize) = sum(ceil(piece_size / chunksize))
chunksize = first power of 2 | (nwork == nseg) or (nwork < 8*num_multiprocessors)
workpieces = [(seg, piece, t0, n)]
mac_point_kernel<Ns, Ns>(nwork)
determine seg, piece, t-range
all: sc
[ij] shared: Q_eigenvec[sc], Q_eiginv[sc]
[i] shared: xamp, xvar, Q_eig[sc], P0
shared: amp0, var0
----
each t:
[] all: data.x
[ij] shared: tmp_ij = x * eig_i * inv_ij
----
[ij]k shared: Pt_i = P0_j * vec_jk * tmp_ki
----
[ij] varPt_ij = (i==j) ? Pt[i] * (1-Pt[i])
: - Pt[i] * Pt[j]
[i] tmp_i = varPt_ik * xamp_k
[]k {data.mean = sum_k(Pt_k * xamp_k)
[] {data.mean = data.mean * Nchannel * dV + Ileak + amp0
----
[]k {data.var = sum_k(xamp_k * tmp_k + Pt_k * xvar_k)
{ * dV
{ * Nchannel + var0 // all divided by averagecount
[] {point_ll = (y - mean)**2
{ =? point_ll / data.var + log(data.var)
get data from device; sum point_ll
data holds context or NULLs:
context and whatever
compiled kernels
non-model-spec. global pointers
work-pieces
separate qubx_mac_ll_openmp;
front qubx_mac_ll tries opencl_8 (auto 16 etc...) then _openmp
conditional on opencl flag: implement or auto-fail:
qubx_mac_ll_opencl_8:
setup data.context etc if NULL
setup model-spec. pointers
compute seg P0
call P0 kernel
call point kernel
sum point LL (and seg LLs?)
free model-spec. pointers
*/
#ifdef QUBX_MAC_OPENCL
class qubx_mac_workslice
{
public:
int seg, piece, offset, count;
qubx_mac_workslice(int seg_, int piece_, int offset_, int count_)
: seg(seg_), piece(piece_), offset(offset_), count(count_)
{}
};
int qubx_mac_ll_opencl(qubx_mac_param *p)
{
qubx_mac_data *dat = (qubx_mac_data *) p->data;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
qubx_model *model = st->models[0].model;
int dim = st->ocl_dim;
int kern_ix = ((dim == 4) ? QUBX_4
: ((dim == 8) ? QUBX_8
: QUBX_16));
print_timestamp("begin opencl");
int rtn = 0;
if ( (rtn = qubx_mac_opencl_setup_context(dat)) )
return rtn;
print_timestamp("have context");
if ( (rtn = qubx_mac_opencl_setup_data(dat)) )
return rtn;
print_timestamp("sent data");
if ( (rtn = qubx_mac_opencl_setup_data_V(dat, p)) )
return rtn;
print_timestamp("setup data V");
if ( (rtn = qubx_mac_opencl_setup_model(p, dat)) )
return rtn;
print_timestamp("setup model");
if ( (rtn = qubx_mac_opencl_setup_P0(dat, p)) )
return rtn;
print_timestamp("setup init. P0");
size_t local_dim[2] = {(size_t) dim, (size_t) dim};
size_t global_dim[2] = {(size_t) dim, (size_t) dim};
global_dim[0] = dim*dat->segments.size();
cl_kernel P0_kernel = dat->ocl64_P0_kernel[kern_ix] ? dat->ocl64_P0_kernel[kern_ix] : dat->ocl_P0_kernel[kern_ix];
cl_kernel point_kernel = dat->ocl64_point_kernel[kern_ix] ? dat->ocl64_point_kernel[kern_ix] : dat->ocl_point_kernel[kern_ix];
if ( (rtn = clSetKernelArg(P0_kernel, 0, sizeof(int), & model->Nstate)) ) {
cerr << "clSetKernelArg(P0, 0): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 1, sizeof(int), & dat->maxNpiece)) ) {
cerr << "clSetKernelArg(P0, 1): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 2, sizeof(int), & dat->Nstim)) ) {
cerr << "clSetKernelArg(P0, 2): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 3, sizeof(cl_mem), & st->ocl_QQ_eig)) ) {
cerr << "clSetKernelArg(P0, 3): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 4, sizeof(cl_mem), & st->ocl_QQ_evec)) ) {
cerr << "clSetKernelArg(P0, 4): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 5, sizeof(cl_mem), & st->ocl_QQ_einv)) ) {
cerr << "clSetKernelArg(P0, 5): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 6, sizeof(cl_mem), & dat->ocl_seg_npiece)) ) {
cerr << "clSetKernelArg(P0, 6): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 7, sizeof(cl_mem), & dat->ocl_seg_piece_sc)) ) {
cerr << "clSetKernelArg(P0, 7): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 8, sizeof(cl_mem), & dat->ocl_seg_piece_dur)) ) {
cerr << "clSetKernelArg(P0, 8): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(P0_kernel, 9, sizeof(cl_mem), & dat->ocl_seg_piece_P0)) ) {
cerr << "clSetKernelArg(P0, 9): error " << rtn << endl;
return rtn;
}
print_timestamp("start P0 kernel");
if ( (rtn = clEnqueueNDRangeKernel(dat->ocl_queue, P0_kernel, 2, 0,
(const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) {
cerr << "clEnqueueNDRangeKernel(ocl_P0_kernel): error " << rtn << endl;
return rtn;
}
print_timestamp("end P0 kernel");
global_dim[0] = dim*dat->ocl_slicecount;
if ( (rtn = clSetKernelArg(point_kernel, 0, sizeof(int), & model->Nstate)) ) {
cerr << "clSetKernelArg(point, 0): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 1, sizeof(int), & dat->maxNpiece)) ) {
cerr << "clSetKernelArg(point, 1): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 2, sizeof(int), & dat->Nstim)) ) {
cerr << "clSetKernelArg(point, 2): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 3, sizeof(int), & model->Nchannel)) ) {
cerr << "clSetKernelArg(point, 3): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 4, sizeof(cl_mem), & st->ocl_QQ_eig)) ) {
cerr << "clSetKernelArg(point, 4): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 5, sizeof(cl_mem), & st->ocl_QQ_evec)) ) {
cerr << "clSetKernelArg(point, 5): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 6, sizeof(cl_mem), & st->ocl_QQ_einv)) ) {
cerr << "clSetKernelArg(point, 6): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 7, sizeof(cl_mem), & dat->ocl_seg_npiece)) ) {
cerr << "clSetKernelArg(point, 7): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 8, sizeof(cl_mem), & dat->ocl_seg_piece_sc)) ) {
cerr << "clSetKernelArg(point, 8): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 9, sizeof(cl_mem), & dat->ocl_seg_piece_dur)) ) {
cerr << "clSetKernelArg(point, 9): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 10, sizeof(cl_mem), & dat->ocl_seg_piece_P0)) ) {
cerr << "clSetKernelArg(point, 10): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 11, sizeof(cl_mem), & dat->ocl_seg_piece_dV)) ) {
cerr << "clSetKernelArg(point, 11): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 12, sizeof(cl_mem), & dat->ocl_seg_piece_Ileak)) ) {
cerr << "clSetKernelArg(point, 12): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 13, sizeof(cl_mem), & st->ocl_xamp)) ) {
cerr << "clSetKernelArg(point, 13): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 14, sizeof(cl_mem), & st->ocl_xvar)) ) {
cerr << "clSetKernelArg(point, 14): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 15, sizeof(cl_mem), & dat->ocl_workslices)) ) {
cerr << "clSetKernelArg(point, 15): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 16, sizeof(cl_mem), & dat->ocl_samples)) ) {
cerr << "clSetKernelArg(point, 16): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 17, sizeof(cl_mem), & dat->ocl_samples_out)) ) {
cerr << "clSetKernelArg(point, 17): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 18, sizeof(float), & st->ocl_amp0)) ) {
cerr << "clSetKernelArg(point, 18): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 19, sizeof(float), & st->ocl_var0)) ) {
cerr << "clSetKernelArg(point, 19): error " << rtn << endl;
return rtn;
}
if ( (rtn = clSetKernelArg(point_kernel, 20, sizeof(cl_mem), & dat->ocl_dbg)) ) {
cerr << "clSetKernelArg(point, 20): error " << rtn << endl;
return rtn;
}
print_timestamp("start point kernel");
if ( (rtn = clEnqueueNDRangeKernel(dat->ocl_queue, point_kernel, 2, 0,
(const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) {
cerr << "clEnqueueNDRangeKernel(ocl_point_kernel): error " << rtn << endl;
return rtn;
}
print_timestamp("end point kernel");
/*
if ( (rtn = clEnqueueReadBuffer(dat->ocl_queue, dat->ocl_dbg, CL_TRUE, 0,
9*8*sizeof(float), st->transfer, 0, NULL, NULL)) ) {
cerr << "clEnqueueReadBuffer(ocl_dbg): error " << rtn << endl;
return rtn;
}
for (int i=0; i<9; ++i) {
for (int j=0; j<8; ++j) {
cerr << st->transfer[8*i+j] << '\t';
}
cerr << endl;
}
*/
double acc = 0.0;
if ( (rtn = clEnqueueReadBuffer(dat->ocl_queue, dat->ocl_samples_out, CL_TRUE, 0,
dat->Ndata*sizeof(qubx_mac_sample_out), dat->data_out, 0, NULL, NULL)) ) {
cerr << "clEnqueueReadBuffer(ocl_samples_out): error " << rtn << endl;
return rtn;
}
print_timestamp("read data");
qubx_mac_sample_out *result = dat->data_out;
double log2pi = log(2.0*M_PI);
#pragma omp parallel for reduction (+: acc)
for (int i=0; i<dat->Ndata; ++i) {
double pointSS = result[i].ll;
acc += pointSS;
result[i].ll = (float) (-0.5 * (pointSS + log2pi));
}
print_timestamp("summed ll");
// if not usevar: *ll = - acc;
p->ll = - 0.5 * (acc + dat->Ndata * log2pi);
return 0; // success
}
int qubx_mac_opencl_setup_context(qubx_mac_data *data)
{
if ( data->ocl_context )
return 0; //already done
cl_platform_id plat_ids[3];
cl_device_id dev_ids[3];
cl_context_properties props[3];
cl_uint n, ndesc;
cl_platform_id p_max = 0;
cl_device_id d_max = 0;
cl_uint n_max = 0;
cl_uint compunits;
int errcode = 0;
if ( (errcode = clGetPlatformIDs(3, plat_ids, &n)) ) {
cerr << "clGetPlatformIDs: error " << errcode << endl;
goto RTN_FAIL;
}
for (cl_uint i = 0; i < n; i++) {
if ( (errcode = clGetDeviceIDs(plat_ids[i], CL_DEVICE_TYPE_GPU, 3*sizeof(cl_device_id), dev_ids, &ndesc)) ) {
cerr << "clGetDeviceIDs(platform " << i << "): error " << errcode << endl;
goto RTN_FAIL;
}
for (size_t d=0; d<ndesc; ++d) {
if ( (errcode = clGetDeviceInfo(dev_ids[d], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &compunits, NULL)) ) {
cerr << "clGetDeviceInfo: error " << errcode << endl;
goto RTN_FAIL;
}
//cerr << "Platform " << i << ", device " << d << ": " << compunits << " comp units" << endl;
if ( compunits > n_max ) {
n_max = compunits;
d_max = dev_ids[d];
p_max = plat_ids[i];
}
}
}
data->ocl_device = d_max;
data->ocl_compunits = n_max;
props[0] = CL_CONTEXT_PLATFORM;
props[1] = (cl_context_properties)p_max;
props[2] = 0;
//char exts[1000];
//clGetDeviceInfo(data->ocl_device, CL_DEVICE_EXTENSIONS, 1000, exts, NULL);
//cerr << "Extensions: " << exts << endl;
data->ocl_context = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode);
if ( errcode ) {
cerr << "clCreateContextFromType: error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_queue = clCreateCommandQueue(data->ocl_context, data->ocl_device, 0, &errcode);
if ( errcode ) {
cerr << "clCreateCommandQueue: error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_program = clCreateProgramWithSource(data->ocl_context, 1, &s_mac_kernels, NULL, &errcode);
if ( errcode ) {
cerr << "clCreateProgramWithSource: error " << errcode << endl;
goto RTN_FAIL;
}
errcode = clBuildProgram(data->ocl_program, 1, &data->ocl_device, 0, 0, 0);
if ( errcode ) {
cerr << "clBuildProgram: error " << errcode << endl;
/* doesn't work:
char build_msg[1000];
clGetProgramBuildInfo(data->ocl_program, data->ocl_device, CL_PROGRAM_BUILD_LOG, 1000, build_msg, 0);
cerr << build_msg << endl;
*/
goto RTN_FAIL;
}
data->ocl_P0_kernel[QUBX_4] = clCreateKernel(data->ocl_program, "mac_P0_kernel_4", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_P0_kernel_4): error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_point_kernel[QUBX_4] = clCreateKernel(data->ocl_program, "mac_point_kernel_4", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_point_kernel_4): error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_P0_kernel[QUBX_8] = clCreateKernel(data->ocl_program, "mac_P0_kernel_8", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_P0_kernel_8): error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_point_kernel[QUBX_8] = clCreateKernel(data->ocl_program, "mac_point_kernel_8", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_point_kernel_8): error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_P0_kernel[QUBX_16] = clCreateKernel(data->ocl_program, "mac_P0_kernel_16", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_P0_kernel_16): error " << errcode << endl;
goto RTN_FAIL;
}
data->ocl_point_kernel[QUBX_16] = clCreateKernel(data->ocl_program, "mac_point_kernel_16", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac_point_kernel_16): error " << errcode << endl;
goto RTN_FAIL;
}
/* double precision is 3x slower on tesla c1060, comparable to quad-core cpu
data->ocl64_program = clCreateProgramWithSource(data->ocl_context, 1, &s_mac64_kernels, NULL, &errcode);
if ( errcode ) {
cerr << "clCreateProgramWithSource(64): error " << errcode << endl;
return 0; // good enough
}
errcode = clBuildProgram(data->ocl64_program, 1, &data->ocl_device, 0, 0, 0);
if ( errcode ) {
cerr << "clBuildProgram(64): error " << errcode << endl;
char build_msg[1000];
clGetProgramBuildInfo(data->ocl64_program, data->ocl_device, CL_PROGRAM_BUILD_LOG, 1000, build_msg, 0);
cerr << build_msg << endl;
return 0; // good enough
}
data->ocl64_P0_kernel[QUBX_4] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_4", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_P0_kernel_4): error " << errcode << endl;
return 0; // good enough
}
data->ocl64_point_kernel[QUBX_4] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_4", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_point_kernel_4): error " << errcode << endl;
return 0; // good enough
}
data->ocl64_P0_kernel[QUBX_8] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_8", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_P0_kernel_8): error " << errcode << endl;
return 0; // good enough
}
data->ocl64_point_kernel[QUBX_8] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_8", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_point_kernel_8): error " << errcode << endl;
return 0; // good enough
}
data->ocl64_P0_kernel[QUBX_16] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_16", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_P0_kernel_16): error " << errcode << endl;
return 0; // good enough
}
data->ocl64_point_kernel[QUBX_16] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_16", &errcode);
if ( errcode ) {
cerr << "clCreateKernel(mac64_point_kernel_16): error " << errcode << endl;
}
*/
return 0;
RTN_FAIL:
qubx_mac_opencl_cleanup_context(data);
return errcode;
}
void qubx_mac_opencl_cleanup_context(qubx_mac_data *data)
{
for (int i=0; i<3; ++i) {
if ( data->ocl_point_kernel[i] ) {
clReleaseKernel(data->ocl_point_kernel[i]);
data->ocl_point_kernel[i] = NULL;
}
if ( data->ocl_P0_kernel[i] ) {
clReleaseKernel(data->ocl_P0_kernel[i]);
data->ocl_P0_kernel[i] = NULL;
}
if ( data->ocl64_point_kernel[i] ) {
clReleaseKernel(data->ocl64_point_kernel[i]);
data->ocl64_point_kernel[i] = NULL;
}
if ( data->ocl64_P0_kernel[i] ) {
clReleaseKernel(data->ocl64_P0_kernel[i]);
data->ocl64_P0_kernel[i] = NULL;
}
}
if ( data->ocl_program ) {
clReleaseProgram(data->ocl_program);
data->ocl_program = NULL;
}
if ( data->ocl64_program ) {
clReleaseProgram(data->ocl64_program);
data->ocl64_program = NULL;
}
if ( data->ocl_queue ) {
clReleaseCommandQueue(data->ocl_queue);
data->ocl_queue = NULL;
}
if ( data->ocl_context ) {
clReleaseContext(data->ocl_context);
data->ocl_context = NULL;
}
}
int qubx_mac_opencl_setup_workslices(qubx_mac_data *data)
{
int chunk = SMALLEST_CHUNK;
int count;
int total_pieces;
int errcode;
do {
count = 0;
total_pieces = 0;
for (size_t s=0; s<data->segments.size(); ++s) {
for (size_t p=0; p<data->segments[s].pieces.size(); ++p) {
if ( data->segments[s].pieces[p].data ) {
int pcount = data->segments[s].pieces[p].count;
count += (pcount / chunk) + ((pcount % chunk) ? 1 : 0);
++total_pieces;
}
}
}
if ( (count == total_pieces) || (count < (int) (OVERLOADING*data->ocl_compunits)) )
break;
else
chunk *= 2;
} while ( 1 );
std::vector<qubx_mac_workslice> slices;
for (size_t s=0; s<data->segments.size(); ++s) {
for (size_t p=0; p<data->segments[s].pieces.size(); ++p) {
qubx_mac_data_piece& piece = data->segments[s].pieces[p];
if ( piece.data ) {
int piece_off = (int) (piece.data - data->data);
for (int offset=0; offset<piece.count; offset += chunk) {
slices.push_back( qubx_mac_workslice((int)s, (int)p, piece_off + offset, min(chunk, piece.count - offset)) );
}
}
}
}
data->ocl_workslices = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
slices.size() * sizeof(qubx_mac_workslice), & slices[0], &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_workslices): error " << errcode << endl;
return errcode;
}
data->ocl_slicecount = (int) slices.size();
//cerr << "slice count: " << data->ocl_slicecount << endl;
//cerr << "chunk size: " << chunk << endl;
return 0;
}
int qubx_mac_opencl_setup_data(qubx_mac_data *data)
{
if ( data->ocl_workslices )
return 0; // already set up
int errcode;
int transfer_bytes = (int) (data->segments.size() * data->maxNpiece * 2 * sizeof(float));
data->transfer = new float[(transfer_bytes / sizeof(float)) + ((transfer_bytes % sizeof(float)) ? 1 : 0)];
float *fbuf = data->transfer;
int *ibuf = (int *) fbuf;
for (size_t s=0; s<data->segments.size(); ++s)
ibuf[s] = (int) data->segments[s].pieces.size();
data->ocl_seg_npiece = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
data->segments.size()*sizeof(int), ibuf, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_npiece): error " << errcode << endl;
return errcode;
}
for (size_t s=0; s<data->segments.size(); ++s) {
float *fbuf_s = fbuf + s*data->maxNpiece;
for (size_t p=0; p<data->segments[s].pieces.size(); ++p) {
fbuf_s[p] = (float) data->segments[s].pieces[p].dur;
}
}
data->ocl_seg_piece_dur = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
data->segments.size()*data->maxNpiece*sizeof(float), fbuf, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_piece_dur): error " << errcode << endl;
return errcode;
}
for (size_t s=0; s<data->segments.size(); ++s) {
int *ibuf_s = ibuf + s*data->maxNpiece;
for (size_t p=0; p<data->segments[s].pieces.size(); ++p) {
ibuf_s[p] = data->segments[s].pieces[p].stimclass;
}
}
data->ocl_seg_piece_sc = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
data->segments.size()*data->maxNpiece*sizeof(int), ibuf, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_piece_sc): error " << errcode << endl;
return errcode;
}
data->ocl_seg_piece_P0 = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE,
data->segments.size()*data->maxNpiece*QUBX_OCL_MAX_NSTATE*sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_piece_P0): error " << errcode << endl;
return errcode;
}
data->ocl_seg_piece_dV = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE,
data->segments.size()*data->maxNpiece*sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_piece_dV): error " << errcode << endl;
return errcode;
}
data->ocl_seg_piece_Ileak = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE,
data->segments.size()*data->maxNpiece*sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_seg_piece_Ileak): error " << errcode << endl;
return errcode;
}
data->ocl_samples = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
data->Ndata*sizeof(qubx_mac_sample), data->data, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_samples): error " << errcode << endl;
return errcode;
}
data->ocl_samples_out = clCreateBuffer(data->ocl_context, CL_MEM_WRITE_ONLY,
data->Ndata*sizeof(qubx_mac_sample_out), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_samples_out): error " << errcode << endl;
return errcode;
}
data->ocl_dbg = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE,
QUBX_OCL_MAX_NSTATE*QUBX_OCL_MAX_NSTATE*sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_dbg): error " << errcode << endl;
return errcode;
}
return qubx_mac_opencl_setup_workslices(data);
}
void qubx_mac_opencl_cleanup_data(qubx_mac_data *data)
{
clReleaseMemObject(data->ocl_workslices);
data->ocl_workslices = NULL;
clReleaseMemObject(data->ocl_dbg);
data->ocl_dbg = NULL;
clReleaseMemObject(data->ocl_samples_out);
data->ocl_samples_out = NULL;
clReleaseMemObject(data->ocl_samples);
data->ocl_samples = NULL;
clReleaseMemObject(data->ocl_seg_piece_P0);
data->ocl_seg_piece_P0 = NULL;
clReleaseMemObject(data->ocl_seg_piece_dV);
data->ocl_seg_piece_dV = NULL;
clReleaseMemObject(data->ocl_seg_piece_Ileak);
data->ocl_seg_piece_Ileak = NULL;
clReleaseMemObject(data->ocl_seg_piece_sc);
data->ocl_seg_piece_sc = NULL;
clReleaseMemObject(data->ocl_seg_piece_dur);
data->ocl_seg_piece_dur = NULL;
clReleaseMemObject(data->ocl_seg_npiece);
data->ocl_seg_npiece = NULL;
delete [] data->transfer;
data->sent_v = 0;
}
int qubx_mac_opencl_setup_model(qubx_mac_param *p, qubx_mac_data *data)
{
int errcode;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
qubx_model *model = st->models[0].model;
qubx_stim_rates *ratesm = st->models[0].ratesm;
int dim = st->ocl_dim;
if ( ! st->ocl_xvar ) {
int transfer_bytes = max((int) (QUBX_OCL_MAX_NSTATE * QUBX_OCL_MAX_NSTATE * 2 * data->Nstim * sizeof(float)),
(int) (data->segments.size() * data->maxNpiece * sizeof(float)));
st->transfer = new float[(transfer_bytes / sizeof(float)) + ((transfer_bytes % sizeof(float)) ? 1 : 0)];
st->ocl_QQ_eig = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY,
dim * data->Nstim * sizeof(float),
NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_QQ_eig): error " << errcode << endl;
return errcode;
}
st->ocl_QQ_evec = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY,
dim * dim * data->Nstim * sizeof(float),
NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_QQ_evec): error " << errcode << endl;
return errcode;
}
st->ocl_QQ_einv = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY,
dim * dim * data->Nstim * sizeof(float),
NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_QQ_einv): error " << errcode << endl;
return errcode;
}
st->ocl_xamp = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY,
dim * sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_xamp): error " << errcode << endl;
return errcode;
}
st->ocl_xvar = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY,
dim * sizeof(float), NULL, &errcode);
if ( errcode ) {
cerr << "clCreateBuffer(ocl_xvar): error " << errcode << endl;
return errcode;
}
}
float *fbuf = st->transfer;
float *fbuf_row;
fbuf_row = fbuf;
for (int sc=0; sc<data->Nstim; ++sc, fbuf_row += dim) {
int i = 0;
for (; i<model->Nstate; ++i)
fbuf_row[i] = (float) ratesm->QQ_eig[sc][i];
for (; i<dim; ++i)
fbuf_row[i] = 0.0f;
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_eig, CL_TRUE, 0,
data->Nstim*dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_QQ_eig): error " << errcode << endl;
return errcode;
}
fbuf_row = fbuf;
for (int sc=0; sc<data->Nstim; ++sc) {
int r = 0;
for (; r<model->Nstate; ++r, fbuf_row += dim) {
int c = 0;
for (; c<model->Nstate; ++c)
fbuf_row[c] = (float) ratesm->QQ_evec[sc][r][c];
for (; c<dim; ++c)
fbuf_row[c] = 0.0f;
}
for (; r<dim; ++r, fbuf_row += dim)
memset(fbuf_row, 0, dim*sizeof(float));
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_evec, CL_FALSE, 0,
data->Nstim*dim*dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_QQ_evec): error " << errcode << endl;
return errcode;
}
// fbuf_row = fbuf;
for (int sc=0; sc<data->Nstim; ++sc) {
int r = 0;
for (; r<model->Nstate; ++r, fbuf_row += dim) {
int c = 0;
for (; c<model->Nstate; ++c)
fbuf_row[c] = (float) ratesm->QQ_einv[sc][r][c];
for (; c<dim; ++c)
fbuf_row[c] = 0.0f;
}
for (; r<dim; ++r, fbuf_row += dim)
memset(fbuf_row, 0, dim*sizeof(float));
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_einv, CL_TRUE, 0,
data->Nstim*dim*dim*sizeof(float), fbuf+data->Nstim*dim*dim, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_QQ_einv): error " << errcode << endl;
return errcode;
}
}
memset(fbuf, 0, 16*sizeof(float));
for (int i=0; i<model->Nstate; ++i) {
fbuf[i] = (float) st->models[0].xamp.p[i];
fbuf[dim+i] = (float) st->models[0].xvar.p[i];
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_xamp, CL_FALSE, 0,
dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_xamp): error " << errcode << endl;
return errcode;
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_xvar, CL_TRUE, 0,
dim*sizeof(float), fbuf+dim, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_xvar): error " << errcode << endl;
return errcode;
}
st->ocl_amp0 = (float) model->amp0;
st->ocl_var0 = (float) model->var0;
return 0;
}
void qubx_mac_opencl_cleanup_model(qubx_mac_param_storage *st)
{
clReleaseMemObject(st->ocl_xvar);
st->ocl_xvar = NULL;
clReleaseMemObject(st->ocl_xamp);
st->ocl_xamp = NULL;
clReleaseMemObject(st->ocl_QQ_eig);
st->ocl_QQ_eig = NULL;
clReleaseMemObject(st->ocl_QQ_evec);
st->ocl_QQ_evec = NULL;
clReleaseMemObject(st->ocl_QQ_einv);
st->ocl_QQ_einv = NULL;
delete [] st->transfer;
}
int qubx_mac_opencl_setup_data_V(qubx_mac_data *data, qubx_mac_param *p)
{
if ( data->sent_v )
return 0;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
qubx_model *model = st->models[0].model;
qubx_stim_amps *ampm = st->models[0].ampm;
float Vrev = (float) (1e-3 * model->Vrev);
float Vleak = (float) (1e-3 * model->Vleak);
float Gleak = (float) model->Gleak;
float *fbuf = data->transfer;
float *fbuf2 = data->transfer + data->segments.size() * data->maxNpiece;
int errcode = 0;
if ( model->IeqFv ) {
for (size_t s=0; s<data->segments.size(); ++s) {
float *fbuf_s = fbuf + s * data->maxNpiece;
float *fbuf2_s = fbuf2 + s * data->maxNpiece;
for (size_t ip=0; ip<data->segments[s].pieces.size(); ++ip) {
float V = (float) (1e-3 * ampm->stimclasses[data->segments[s].pieces[ip].stimclass * ampm->Nsig + model->iVoltage]);
fbuf_s[ip] = V - Vrev;
fbuf2_s[ip] = (V - Vleak) * Gleak;
}
}
} else {
memset(fbuf, 0, data->segments.size() * data->maxNpiece * 2 * sizeof(float));
for (size_t s=0; s<data->segments.size(); ++s) {
float *fbuf_s = fbuf + s * data->maxNpiece;
for (size_t ip=0; ip<data->segments[s].pieces.size(); ++ip)
fbuf_s[ip] = 1.0;
}
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_dV, CL_FALSE, 0,
data->segments.size() * data->maxNpiece * sizeof(float),
fbuf, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_seg_piece_dV): error " << errcode << endl;
return errcode;
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_Ileak, CL_TRUE, 0,
data->segments.size() * data->maxNpiece * sizeof(float),
fbuf2, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_seg_piece_Ileak): error " << errcode << endl;
return errcode;
}
data->sent_v = 1;
return 0;
}
int qubx_mac_opencl_setup_P0(qubx_mac_data *data, qubx_mac_param *p)
{
int errcode;
qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage;
qubx_model *model = st->models[0].model;
qubx_stim_rates *ratesm = st->models[0].ratesm;
qubx_stim_rates_storage *ratesm_st = (qubx_stim_rates_storage *) ratesm->storage;
matrix<double> P0(1, model->Nstate);
int dim = st->ocl_dim;
// each seg: find P0 (equil?), put in model->transfer, write to data->ocl_seg_piece_P0 +s*maxNpiece*dim*float
if ( ! model->usePeq ) {
memset(st->transfer, 0, dim*sizeof(float));
for (int i=0; i<model->Nstate; ++i)
st->transfer[i] = (float) model->P0[i];
}
for (size_t s=0; s<data->segments.size(); ++s) {
if ( model->usePeq ) {
memset(st->transfer, 0, dim*sizeof(float));
QtoPe(ratesm_st->QQ[data->segments[s].pieces[0].stimclass].m, P0);
for (int i=0; i<model->Nstate; ++i)
st->transfer[i] = (float) P0(0,i);
}
if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_P0, CL_TRUE, s*data->maxNpiece*dim*sizeof(float),
dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) {
cerr << "clEnqueueWriteBuffer(ocl_seg_piece_P0): error " << errcode << endl;
return errcode;
}
}
return 0;
}
#endif
// TODO: video, email