max_mac_ll.cpp.html | mathcode2html |
Source file: max_mac_ll.cpp | |
Converted: Fri Feb 3 2017 at 15:42:51 | |
This documentation file will not reflect any later changes in the source file. |
$$\phantom{******** If you see this on the webpage then the
browser could not locate *********}$$
$$\phantom{******** jsMath/easy/load.js or the variable root
is set wrong in this file *********}$$
$$\newcommand{\vector}[1]{\left[\begin{array}{c} #1 \end{array}\right]}$$
$$\newenvironment{matrix}{\left[\begin{array}{cccccccccc}} {\end{array}\right]}$$
$$\newcommand{\A}{{\cal A}}$$
$$\newcommand{\W}{{\cal W}}$$
/* Copyright 2008-2016 Research Foundation State University of New York */ /* This file is part of QUB Express. */ /* QUB Express is free software; you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation, either version 3 of the License, or */ /* (at your option) any later version. */ /* QUB Express is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* You should have received a copy of the GNU General Public License, */ /* named LICENSE.txt, in the QUB Express program directory. If not, see */ /* <http://www.gnu.org/licenses/>. */ /*
|
See also: Up: Index |
*/ #ifdef _WIN32 #include <windows.h> #else #include <stdlib.h> #define BOOL int #define TRUE 1 #define FALSE 0 #endif #include <string.h> #include <iostream> #include <fstream> #define _USE_MATH_DEFINES #include <math.h> #include <string.h> #include <set> #include <map> #include <vector> #include <algorithm> #ifdef QUBX_MAC_OPENCL #ifdef __APPLE__ // mac os x: #include <OpenCL/opencl.h> #else // linux/nvidia: #include <CL/cl.h> #endif #endif #include "max_mac_ll.h" #include "../qubfast/max_ll_util.h" #include "../qubfast/qubx_model_storage.h" #include "../qubfast/ublas_matrixutil.h" #include "../qubfast/callbk_reportstream.h" #include "../qubfast/qub_constraints.h" using std::cout; using std::cerr; using std::endl; using std::set; using std::map; using std::pair; using std::copy; using std::min; using std::max; class qubx_mac_data; class qubx_mac_param_storage; #ifdef QUBX_MAC_OPENCL #include "max_mac_ll.opencl" //#include "max_mac_ll_64.opencl" #define SMALLEST_CHUNK 8 #define OVERLOADING 16 #define QUBX_OCL_MAX_NSTATE 16 #define QUBX_OCL_MIN_NDATA 256 #define QUBX_4 0 #define QUBX_8 1 #define QUBX_16 2 int qubx_mac_opencl_setup_context(qubx_mac_data *data); void qubx_mac_opencl_cleanup_context(qubx_mac_data *data); int qubx_mac_opencl_setup_data(qubx_mac_data *data); void qubx_mac_opencl_cleanup_data(qubx_mac_data *data); int qubx_mac_opencl_setup_model(qubx_mac_param *p, qubx_mac_data *data); void qubx_mac_opencl_cleanup_model(qubx_mac_param_storage *st); int qubx_mac_opencl_setup_data_V(qubx_mac_data *data, qubx_mac_param *p); int qubx_mac_opencl_setup_P0(qubx_mac_data *data, qubx_mac_param *p); int qubx_mac_ll_opencl(qubx_mac_param *p); #endif int qubx_mac_ll_basic(qubx_mac_param *p); int qubx_mac_ll_openmp(qubx_mac_param *p); #ifdef _WIN32 #include "../qubfast/gettimeofday.h" #else #include <sys/time.h> #endif void print_timestamp(MAXMLL_VAR_NOT_USED const char *lbl) { //struct timeval tm; //gettimeofday(&tm, NULL); //cerr << "--- " << tm.tv_sec << " : " << (tm.tv_usec / 1000) << "." << (tm.tv_usec % 1000) << " --- " << lbl << endl; } typedef map<std::vector<double>, int> stimmap; void mplex_idl(int Nsignal, int *dwellCounts, int **classeses, float **durationses, double **ampses, int *out_dwellCount, int *out_classes, float *out_durations, stimmap& stimcls_of) { if ( ! Nsignal ) return; // the key to stimmap std::vector<double> amps(1+Nsignal); // output int &Nd = *out_dwellCount; Nd = 0; // per-signal stuff std::vector<int> finger(Nsignal, 0); std::vector<int> clss(Nsignal); std::vector<float> remain(Nsignal); for (int i=0; i<Nsignal; ++i) { if ( ! dwellCounts[i] ) return; clss[i] = classeses[i][0]; remain[i] = durationses[i][0]; if ( clss[i] >= 0 ) amps[i+1] = ampses[i][ clss[i] ]; else amps[i+1] = 0.0; // gap at beginning of stimulus: should complain and fail; instead silently defaulting to zero } while ( 1 ) { // until any signal ends bool done = false; for (int i=0; i<Nsignal; ++i) if ( finger[i] >= dwellCounts[i] ) done = true; if ( done ) break; // find signal with shortest remaining float tm = remain[0]; //int changing = 0; for (int i=0; i<Nsignal; ++i) { if ( remain[i] < tm ) { tm = remain[i]; //changing = i; } } // look up stimclass stimmap::iterator sti = stimcls_of.find(amps); if ( sti == stimcls_of.end() ) { int sc = (int) stimcls_of.size(); stimcls_of[amps] = sc; } // append the event out_classes[Nd] = stimcls_of[amps]; out_durations[Nd] = tm; ++Nd; // shorten all signals remain, and move the finger for any that are 0 for (int i=0; i<Nsignal; ++i) { remain[i] -= tm; if ( remain[i] <= 0.0 ) { if ( (++finger[i]) < dwellCounts[i] ) { remain[i] = durationses[i][ finger[i] ]; clss[i] = classeses[i][ finger[i] ]; if ( clss[i] >= 0 ) amps[i+1] = ampses[i][ clss[i] ]; // else: gap: keep the last event's amp } } } } } class qubx_mac_data_piece { public: int stimclass; qubx_mac_sample *data; // pointer into mac_data->data, or NULL if count == 0 qubx_mac_sample_out *data_out; // pointer into mac_data->data_out, or NULL if count == 0 int count; double dur; qubx_mac_data_piece(int stimclass_, qubx_mac_sample *data_, qubx_mac_sample_out *data_out_, int count_, double dur_) : stimclass(stimclass_), data(data_), data_out(data_out_), count(count_), dur(dur_) {} }; class qubx_mac_data_seg { public: std::vector<qubx_mac_data_piece> pieces; double dur; qubx_mac_data_seg() : dur(0.0) {} }; class qubx_mac_data { public: int Nstim, Nsig, Ndata, data_cap, maxNpiece; qubx_mac_sample *data; qubx_mac_sample_out *data_out; std::vector<qubx_mac_data_seg> segments; std::vector<double> stimclasses; stimmap stimcls_of; std::vector<double> sc_frac; #ifdef QUBX_MAC_OPENCL cl_context ocl_context; cl_device_id ocl_device; cl_uint ocl_compunits; cl_command_queue ocl_queue; cl_program ocl_program, ocl64_program; cl_kernel ocl_P0_kernel[3], ocl_point_kernel[3], ocl64_P0_kernel[3], ocl64_point_kernel[3]; cl_mem ocl_seg_npiece, ocl_seg_piece_dur, ocl_seg_piece_sc; cl_mem ocl_samples, ocl_samples_out, ocl_fit_var_ll, ocl_workslices; cl_mem ocl_seg_piece_P0, ocl_seg_piece_dV, ocl_seg_piece_Ileak; int ocl_slicecount; float *transfer; int sent_v; cl_mem ocl_dbg; #endif qubx_mac_data(int Nstimsig_, int data_cap_) : Nstim(0), Nsig(Nstimsig_+1), Ndata(0), data_cap(data_cap_), maxNpiece(0) , data(new qubx_mac_sample[data_cap_]), data_out(new qubx_mac_sample_out[data_cap_]) #ifdef QUBX_MAC_OPENCL , ocl_context(NULL), ocl_device(0), ocl_compunits(0), ocl_queue(NULL), ocl_program(NULL), ocl64_program(NULL) //, ocl_P0_kernel({NULL, NULL, NULL}), ocl_point_kernel({NULL, NULL, NULL}) //, ocl64_P0_kernel({NULL, NULL, NULL}), ocl64_point_kernel({NULL, NULL, NULL}) , ocl_seg_npiece(NULL), ocl_seg_piece_dur(NULL), ocl_seg_piece_sc(NULL) , ocl_samples(NULL), ocl_samples_out(NULL), ocl_fit_var_ll(NULL), ocl_workslices(NULL) , ocl_seg_piece_P0(NULL), ocl_seg_piece_dV(NULL), ocl_seg_piece_Ileak(NULL) , ocl_slicecount(0), transfer(NULL), sent_v(0), ocl_dbg(NULL) { for (int i=0; i<3; ++i) ocl_P0_kernel[i] = ocl_point_kernel[i] = ocl64_P0_kernel[i] = ocl64_point_kernel[i] = NULL; } #else {} #endif virtual ~qubx_mac_data() { delete [] data; delete [] data_out; #ifdef QUBX_MAC_OPENCL qubx_mac_opencl_cleanup_data(this); qubx_mac_opencl_cleanup_context(this); #endif } void reset(int Nstimsig, int newcap) { Nstim = Ndata = maxNpiece = 0; Nsig = Nstimsig + 1; segments.clear(); stimclasses.clear(); stimcls_of.clear(); ensure_capacity(newcap); #ifdef QUBX_MAC_OPENCL qubx_mac_opencl_cleanup_data(this); #endif } void ensure_capacity(int newcap) { if ( data_cap < newcap ) { while ( data_cap < newcap ) data_cap *= 2; qubx_mac_sample *newdata = new qubx_mac_sample[data_cap]; qubx_mac_sample_out *newdata_out = new qubx_mac_sample_out[data_cap]; memcpy(newdata, data, Ndata*sizeof(qubx_mac_sample)); memcpy(newdata_out, data_out, Ndata*sizeof(qubx_mac_sample_out)); for (std::vector<qubx_mac_data_seg>::iterator segi=segments.begin(); segi!=segments.end(); ++segi) { for (std::vector<qubx_mac_data_piece>::iterator piece = segi->pieces.begin(); piece != segi->pieces.end(); ++piece) { if ( piece->data ) { piece->data = newdata + (piece->data - data); piece->data_out = newdata_out + (piece->data_out - data_out); } } } delete [] data; delete [] data_out; data = newdata; data_out = newdata_out; } } int append_segment() { int ix = (int) segments.size(); segments.resize(ix+1); return ix; } void append_chunk(int seg_ix, double dur_sec, int Nd, float *T, float *I, float *V, int *dwellCounts, int **classeses, float **durationses, double **ampses) { ensure_capacity(Ndata + Nd); qubx_mac_sample *di = data + Ndata; for (int i=0; i<Nd; ++i, ++di) { di->time = T[i]; di->cur = I[i]; di->var = V ? V[i] : 0.0f; } qubx_mac_sample *chunk_data = Nd ? (data + Ndata) : NULL; qubx_mac_sample_out *chunk_data_out = Nd ? (data_out + Ndata) : NULL; qubx_mac_sample_out *dio = chunk_data_out; Ndata += Nd; if ( Nsig < 2 ) { segments[seg_ix].pieces.push_back(qubx_mac_data_piece(0, chunk_data, chunk_data_out, Nd, dur_sec)); maxNpiece = max(maxNpiece, (int) segments[seg_ix].pieces.size()); return; } int max_mplex = 0; for (int i=0; i<(Nsig-1); ++i) max_mplex += dwellCounts[i]; std::vector<int> mplex_clss(max_mplex); std::vector<float> mplex_durs(max_mplex); int mplex_count = 0; mplex_idl(Nsig-1, dwellCounts, classeses, durationses, ampses, &mplex_count, & mplex_clss[0], & mplex_durs[0], stimcls_of); // adjust sample times to be relative to last stimulus transition: di = chunk_data; qubx_mac_sample *di_end = data + Ndata; double& seg_dur = segments[seg_ix].dur; for (int i=0; i<mplex_count; ++i) { float prev_dur = (float) seg_dur; seg_dur += 1e-3 * mplex_durs[i]; qubx_mac_sample *piece_data = di; qubx_mac_sample_out *piece_out = dio; while ( di && (di != di_end) && (di->time < seg_dur) ) { di->time -= prev_dur; ++di; ++dio; } segments[seg_ix].pieces.push_back(qubx_mac_data_piece(mplex_clss[i], (di != piece_data) ? piece_data : NULL, (dio != piece_out) ? piece_out : NULL, (int) (di - piece_data), 1e-3 * mplex_durs[i])); } maxNpiece = max(maxNpiece, (int) segments[seg_ix].pieces.size()); } double* get_stimclasses() { Nstim = (int) stimcls_of.size(); if ( ! Nstim ) Nstim = 1; stimclasses.resize(Nstim*Nsig); for (stimmap::iterator stimi = stimcls_of.begin(); stimi != stimcls_of.end(); ++stimi) { int i = stimi->second * Nsig; copy(stimi->first.begin(), stimi->first.end(), & stimclasses[i]); } sc_frac.resize(Nstim); memset(& sc_frac[0], 0, Nstim*sizeof(double)); int total = 0; for (std::vector<qubx_mac_data_seg>::iterator segi=segments.begin(); segi!=segments.end(); ++segi) { for (std::vector<qubx_mac_data_piece>::iterator piece = segi->pieces.begin(); piece != segi->pieces.end(); ++piece) { if ( piece->data ) { total += piece->count; sc_frac[piece->stimclass] += piece->count; } } } if ( total ) for (int i=0; i<Nstim; ++i) sc_frac[i] /= total; return & stimclasses[0]; } }; extern "C" MAXMLL_API void* __stdcall qubx_mac_data_create(int Nstimsig, int data_cap) { return new qubx_mac_data(Nstimsig, data_cap); } extern "C" MAXMLL_API void __stdcall qubx_mac_data_free(void *data) { delete (qubx_mac_data *) data; } MAXMLL_API void __stdcall qubx_mac_data_reset(void *data, int Nstimsig, int data_cap) { ((qubx_mac_data *) data)->reset(Nstimsig, data_cap); } extern "C" MAXMLL_API int __stdcall qubx_mac_data_append_segment(void *data) { return ((qubx_mac_data *) data)->append_segment(); } extern "C" MAXMLL_API void __stdcall qubx_mac_data_append_chunk(void *data, int seg_ix, double dur_sec, int Nd, float *T, float *I, float *V, int *dwellCounts, int **classeses, float **durationses, double **ampses) { ((qubx_mac_data *) data)->append_chunk(seg_ix, dur_sec, Nd, T, I, V, dwellCounts, classeses, durationses, ampses); } extern "C" MAXMLL_API double* __stdcall qubx_mac_data_get_stimclasses(void *data, int *Nstimclass) // returns double[Nstimclass * Nsignal], Nsignal = Nstimsig + 1; { qubx_mac_data *macdata = (qubx_mac_data *) data; double *rtn = macdata->get_stimclasses(); *Nstimclass = macdata->Nstim; return rtn; } extern "C" MAXMLL_API double* __stdcall qubx_mac_data_get_stimclass_frac(void *data) { qubx_mac_data *macdata = (qubx_mac_data *) data; return & macdata->sc_frac[0]; } extern "C" MAXMLL_API qubx_mac_sample_out* __stdcall qubx_mac_data_get_buffer(void *data) { qubx_mac_data *macdata = (qubx_mac_data *) data; return macdata->data_out; } void copy_sub_hess(double *sub, int N, double *full, int x0, int y0, int xn, int yn) { for (int i=0; i<xn; ++i) { for (int j=0; j<yn; ++j) { *(sub++) = full[(x0+i)*N + y0+j]; } } } class qubx_mac_param_model { public: int Npar, NfPar; qubx_model *model; qubx_stim_amps *ampm; qubx_stim_rates *ratesm; pointy_col_vector<double> xamp, xvar; double scale_Nchannel; double float_Nchannel; pointy_col_vector<double> pars, fPars; void init(qubx_model *model_, qubx_stim_amps *ampm_, qubx_stim_rates *ratesm_) { model = model_; ampm = ampm_; ratesm = ratesm_; xamp.resize(model_->Nstate); xvar.resize(model_->Nstate); scale_Nchannel = 1.0; float_Nchannel = model->Nchannel; scale_Nchannel = log(float_Nchannel); } void setup_xampvar() { if ( model->IeqFv ) { for (int s=model->Nstate-1; s>=0; --s) { xvar.p[s] = model->var[model->clazz[s]]; xamp.p[s] = model->amp[model->clazz[s]]; } } else { for (int s=model->Nstate-1; s>=0; --s) { xvar.p[s] = max(0.0, model->var[model->clazz[s]] - model->var[0]); xamp.p[s] = model->amp[model->clazz[s]] - model->amp[0]; } } } void setup_constraints(int optNchannel) { if ( optNchannel ) optNchannel = 1; qubx_constrained_setup(ampm->cns); qubx_stim_rates_setup_constraints(ratesm); Npar = ampm->cns->Npar + ratesm->cns->Npar + optNchannel; NfPar = ampm->cns->NfPar + ratesm->cns->NfPar + optNchannel; pars.resize(Npar); memcpy(pars.p, ratesm->cns->pars, ratesm->cns->Npar*sizeof(double)); fPars.resize(NfPar); memcpy(fPars.p, ratesm->cns->fPars, ratesm->cns->NfPar*sizeof(double)); memcpy(pars.p+ratesm->cns->Npar, ampm->cns->pars, ampm->cns->Npar*sizeof(double)); memcpy(fPars.p+ratesm->cns->NfPar, ampm->cns->fPars, ampm->cns->NfPar*sizeof(double)); if ( optNchannel ) { pars.p[Npar-1] = fPars.p[NfPar-1] = 1.0; } float_Nchannel = model->Nchannel; scale_Nchannel = log(float_Nchannel); } void setup_nchannel() { float_Nchannel = model->Nchannel; scale_Nchannel = log(float_Nchannel); } void do_fPars_to_model(int optNchannel) { if ( ratesm->cns->NfPar ) { memcpy(ratesm->cns->fPars, fPars.p, ratesm->cns->NfPar*sizeof(double)); qubx_constrained_free_to_pars(ratesm->cns); qubx_stim_rates_pars_to_model(ratesm); } if ( ampm->cns->NfPar ) { memcpy(ampm->cns->fPars, fPars.p+ratesm->cns->NfPar, ampm->cns->NfPar*sizeof(double)); qubx_constrained_free_to_pars(ampm->cns); qubx_stim_amps_pars_to_model(ampm); } if ( optNchannel ) { float_Nchannel = max(1.0, exp(scale_Nchannel * fPars.p[NfPar-1])); model->Nchannel = (int) (float_Nchannel + 0.5); } } void calc_std(double *InvHessian, double *pars_all, int NfPar_all, int iPar, int ifPar, int optNchannel) { std::vector<double> subhess(NfPar * NfPar); if ( ratesm->cns->NfPar ) { copy_sub_hess(&subhess[0], NfPar_all, InvHessian, ifPar, ifPar, ratesm->cns->NfPar, ratesm->cns->NfPar); qubx_stim_rates_calc_std(ratesm, &subhess[0]); } if ( ampm->cns->NfPar ) { copy_sub_hess(&subhess[0], NfPar_all, InvHessian, ifPar+ratesm->cns->NfPar, ifPar+ratesm->cns->NfPar, ampm->cns->NfPar, ampm->cns->NfPar); qubx_stim_amps_calc_std(ampm, &subhess[0]); } if ( optNchannel ) { model->d_Nchannel = sqrt(scale_Nchannel * scale_Nchannel * InvHessian[(NfPar_all+1)*(ifPar+NfPar-1)]) * exp(pars_all[iPar+Npar-1]); } } }; class qubx_mac_param_storage { public: std::vector<qubx_mac_param_model> models; int &Npar, &NfPar; // same as param->Npar, param->NfPar pointy_col_vector<double> pars, fPars; int iModel; #ifdef QUBX_MAC_OPENCL float *transfer; cl_mem ocl_QQ_eig, ocl_QQ_evec, ocl_QQ_einv, ocl_xamp, ocl_xvar; float ocl_amp0, ocl_var0; int ocl_dim; #endif qubx_mac_param_storage(int Nmodel, int& Npar_, int& NfPar_) : models(Nmodel), Npar(Npar_), NfPar(NfPar_), iModel(0) #ifdef QUBX_MAC_OPENCL , transfer(NULL), ocl_QQ_eig(NULL), ocl_QQ_evec(NULL), ocl_QQ_einv(NULL), ocl_xamp(NULL), ocl_xvar(NULL) #endif {} #ifdef QUBX_MAC_OPENCL virtual ~qubx_mac_param_storage() { qubx_mac_opencl_cleanup_model(this); } #endif void add_model(qubx_model *model, qubx_stim_amps *ampm, qubx_stim_rates *ratesm) { models[iModel++].init(model, ampm, ratesm); #ifdef QUBX_MAC_OPENCL if ( models.size() == 1 ) { if ( model->Nstate <= 4 ) ocl_dim = 4; else if ( model->Nstate <= 8 ) ocl_dim = 8; else ocl_dim = 16; } #endif } void setup_xampvar() { for (int i=(int)models.size()-1; i>=0; --i) models[i].setup_xampvar(); } int setup_constraints(qubx_mac_param *p) { setup_xampvar(); Npar = NfPar = 0; for (int i=(int)models.size()-1; i>=0; --i) { models[i].setup_constraints(p->optNchannel); Npar += models[i].Npar; NfPar += models[i].NfPar; } pars.resize(Npar); fPars.resize(NfPar); p->pars = pars.p; p->fPars = fPars.p; int iPar = 0; int ifPar = 0; for (size_t i=0; i<models.size(); ++i) { memcpy(pars.p+iPar, models[i].pars.p, models[i].Npar*sizeof(double)); memcpy(fPars.p+ifPar, models[i].fPars.p, models[i].NfPar*sizeof(double)); iPar += models[i].Npar; ifPar += models[i].NfPar; } return 0; } void setup_nchannel() { for (size_t i=0; i<models.size(); ++i) { models[i].setup_nchannel(); } } void do_fPars_to_model(int optNchannel) { int ifPar = 0; for (size_t i=0; i<models.size(); ++i) { memcpy(models[i].fPars.p, fPars.p+ifPar, models[i].NfPar*sizeof(double)); models[i].do_fPars_to_model(optNchannel); ifPar += models[i].NfPar; } setup_xampvar(); } void calc_std(qubx_mac_param *p, double *InvHessian) { int iPar = 0; int ifPar = 0; for (size_t i=0; i<models.size(); ++i) { models[i].calc_std(InvHessian, pars.p, p->NfPar, iPar, ifPar, p->optNchannel); iPar += models[i].Npar; ifPar += models[i].NfPar; } } }; extern "C" MAXMLL_API qubx_mac_param* __stdcall qubx_mac_param_create(int Nmodel) { qubx_mac_param *p = new qubx_mac_param; qubx_mac_param_storage *st = new qubx_mac_param_storage(Nmodel, p->Npar, p->NfPar); p->storage = st; p->useVar = 1; p->optNchannel = 1; p->accel = 10; p->Npar = p->NfPar = 0; p->ll = 0.0; p->gpu_fail = 0; p->float_Nchannel = 0.0; return p; } extern "C" MAXMLL_API void __stdcall qubx_mac_param_free(qubx_mac_param *p) { delete (qubx_mac_param_storage*) p->storage; delete p; } extern "C" MAXMLL_API void __stdcall qubx_mac_param_add_model(qubx_mac_param *p, qubx_model *model, qubx_stim_amps *ampm, qubx_stim_rates *ratesm) { qubx_mac_param_storage *st = (qubx_mac_param_storage*) p->storage; st->add_model(model, ampm, ratesm); } extern "C" MAXMLL_API int __stdcall qubx_mac_setup_constraints(qubx_mac_param *p) { qubx_mac_param_storage *st = (qubx_mac_param_storage*) p->storage; return st->setup_constraints(p); } extern "C" MAXMLL_API void __stdcall qubx_mac_do_fPars_to_model(qubx_mac_param *p) { qubx_mac_param_storage *ps = (qubx_mac_param_storage*) p->storage; ps->do_fPars_to_model(p->optNchannel); p->float_Nchannel = 0.0; // meaning: use the individual models' float_Nchannel unless it gets changed } extern "C" MAXMLL_API void __stdcall qubx_mac_calc_std(qubx_mac_param *p, double *InvHessian) { qubx_mac_param_storage *ps = (qubx_mac_param_storage *) p->storage; ps->calc_std(p, InvHessian); } class qubx_mac_workspace { public: qubx_mac_param *p; qubx_mac_param_model *pm; qubx_mac_param_storage *st; qubx_model *m; qubx_stim_rates_storage *rst; matrix<double> A; matrix<double> P0, Pt; matrix<double> varPt; matrix<double> tmp_m, tmp_v, tmp_s; matrix<double> tmp_diag; int average_count; // how many traces were averaged together (unimplemented == 1) void init(qubx_mac_param *param, qubx_mac_param_model *pmodel) { p = param; pm = pmodel; st = (qubx_mac_param_storage *) param->storage; m = pm->model; rst = (qubx_stim_rates_storage *) pm->ratesm->storage; A.resize(m->Nstate, m->Nstate); P0.resize(1, m->Nstate); Pt.resize(1, m->Nstate); varPt.resize(m->Nstate, m->Nstate); tmp_m.resize(m->Nstate, m->Nstate); tmp_v.resize(1, m->Nstate); tmp_s.resize(1, 1); tmp_diag.resize(m->Nstate, m->Nstate); average_count = 1; tmp_diag.clear(); if ( p->float_Nchannel > 1e-3 ) pm->float_Nchannel = p->float_Nchannel; // override from multi-datafile fit } matrix<double>& do_A(int sc, double t) { rst->setup_A(sc, t, tmp_diag, tmp_m, A); //cerr << "Q:" << endl << ((qubx_stim_rates_storage *) p->ratesm->storage)->QQ[sc].m << endl; //cerr << "A:" << endl << A << endl; return A; } void do_Pt(int sc, double t) { noalias(Pt) = prod(P0, do_A(sc, t)); } matrix<double>& do_P0(int sc) { if ( m->usePeq ) { QtoPe(rst->QQ[sc].m, P0); } else { memcpy(& P0(0,0), m->P0, m->Nstate * sizeof(double)); } return P0; } void advance_P0(int sc, double t) { P0 = prod(P0, do_A(sc, t)); } double do_mean_t(double mV) { noalias(tmp_s) = prod(Pt, pm->xamp.m); double mt = pm->float_Nchannel * tmp_s(0,0); if ( m->IeqFv ) { mt = ((mV - 1e-3*m->Vrev)*mt + (mV - 1e-3*m->Vleak)*m->Gleak); } return ( m->amp0 + mt ); } void do_var_Pt() // state variance { for (int i=0; i<m->Nstate; ++i) for (int j=0; j<m->Nstate; ++j) varPt(i,j) = (i == j) ? (Pt(0,i) * (1.0 - Pt(0,i))) : (- Pt(0,i) * Pt(0,j)); } double do_var_t(double dV) // total variance { noalias(tmp_v) = prod(trans(pm->xamp.m), varPt); noalias(tmp_s) = prod(tmp_v, pm->xamp.m); double v = tmp_s(0,0); noalias(tmp_s) = prod(Pt, pm->xvar.m); v += tmp_s(0,0); if ( m->IeqFv ) v *= dV * dV; v = (m->var0 + pm->float_Nchannel*v) / average_count; //// if ( v <= 0.0 ) v = 1e-30; return ( v ); } }; extern "C" MAXMLL_API int __stdcall qubx_mac_ll(qubx_mac_param *p) { int rtn; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; qubx_model *model = st->models[0].model; if ( p->accel > 1 ) #ifdef QUBX_MAC_OPENCL if ( (model->Nstate <= QUBX_OCL_MAX_NSTATE) && (((qubx_mac_data*) p->data)->Ndata >= QUBX_OCL_MIN_NDATA) && (st->models.size() == 1) ) { if ( ! (rtn = qubx_mac_ll_opencl(p)) ) return 0; else p->gpu_fail = 1; } #else p->gpu_fail = 1; #endif if ( p->accel > 0 ) if ( ! (rtn = qubx_mac_ll_openmp(p)) ) return 0; return qubx_mac_ll_basic(p); } int qubx_mac_ll_basic(qubx_mac_param *p) { print_timestamp("basic"); qubx_mac_data *macdata = (qubx_mac_data *) p->data; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; int& stop_flag = st->models[0].model->stop_flag; p->ll = 0.0; double logl = 0.0; double log2pi = log(2.0*M_PI); int Nmodel = (int) st->models.size(); { std::vector<qubx_mac_workspace> ws(Nmodel); // per omp thread for (int m=0; m<Nmodel; ++m) ws[m].init(p, & st->models[m]); for (size_t s=0; s < macdata->segments.size(); ++s) { for (int m=Nmodel-1; m>=0; --m) ws[m].do_P0(macdata->segments[s].pieces[0].stimclass); double seg_ll = 0.0; int total_points = 0; if ( (! stop_flag) && macdata->segments[s].pieces.size() ) { double dV = 0.0; double mV = 1.0; double piece_time = 0.0; std::vector<qubx_mac_data_piece>& pieces = macdata->segments[s].pieces; int npiece = (int) pieces.size(); for (int ip=0; ip<npiece; ++ip) { qubx_mac_data_piece *piece = &(pieces[ip]); if ( ! stop_flag ) { if ( ws[0].pm->model->IeqFv ) { // update dV mV = 1e-3 * ws[0].pm->ampm->stimclasses[piece->stimclass * ws[0].pm->ampm->Nsig + ws[0].pm->model->iVoltage]; dV = mV - 1e-3 * ws[0].pm->model->Vrev; } for (int isam = 0; isam < piece->count; ++isam) { qubx_mac_sample *sample = piece->data + isam; qubx_mac_sample_out *sample_out = piece->data_out + isam; double mean = 0.0; double var = 0.0; for (int m=Nmodel-1; m>=0; --m) { ws[m].do_Pt(piece->stimclass, sample->time); mean += ws[m].do_mean_t(mV); if ( p->useVar ) { ws[m].do_var_Pt(); var += ws[m].do_var_t(dV); } } sample_out->fit = (float) mean; double ss = sample->cur - mean; ss *= ss; if ( p->useVar ) { //if ( (p == 0) && (isam == 1) ) //cerr << "varPt:" << endl << ws.varPt << endl; sample_out->fit_var = (float) var; if ( var ) ss = log(var) + ss/var; sample_out->ll = (float) (-0.5 * (ss + log2pi)); } else { sample_out->ll = (float) (-ss); } seg_ll += ss; ++total_points; } piece_time += piece->dur; } for (int m=Nmodel-1; m>=0; --m) ws[m].advance_P0(piece->stimclass, piece->dur); } } logl -= seg_ll; if ( p->useVar ) { logl -= total_points * log2pi; } } } if ( p->useVar ) { logl *= 0.5; } p->ll = logl; if ( stop_flag ) { return -8; // stopped } print_timestamp("end basic"); return 0; // success } int qubx_mac_ll_openmp(qubx_mac_param *p) { print_timestamp("start openmp"); qubx_mac_data *macdata = (qubx_mac_data *) p->data; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; p->ll = 0.0; double logl = 0.0; double log2pi = log(2.0*M_PI); int Nmodel = (int) st->models.size(); int& stop_flag = st->models[0].model->stop_flag; int stop_read; #pragma omp parallel reduction (+: logl) shared(stop_read) { std::vector<qubx_mac_workspace> ws(Nmodel); // per omp thread for (int m=0; m<Nmodel; ++m) ws[m].init(p, & st->models[m]); for (size_t s=0; s < macdata->segments.size(); ++s) { for (int m=0; m<Nmodel; ++m) ws[m].do_P0(macdata->segments[s].pieces[0].stimclass); double seg_ll = 0.0; int total_points = 0; if ( macdata->segments[s].pieces.size() ) { double dV = 0.0; double mV = 1.0; double piece_time = 0.0; std::vector<qubx_mac_data_piece>& pieces = macdata->segments[s].pieces; int npiece = (int) pieces.size(); for (int ip=0; ip<npiece; ++ip) { qubx_mac_data_piece *piece = &(pieces[ip]); #pragma omp single { stop_read = stop_flag; } if ( ! stop_read ) { if ( ws[0].pm->model->IeqFv ) { // update dV mV = 1e-3 * ws[0].pm->ampm->stimclasses[piece->stimclass * ws[0].pm->ampm->Nsig + ws[0].pm->model->iVoltage]; dV = mV - 1e-3 * ws[0].pm->model->Vrev; } #pragma omp for for (int isam = 0; isam < piece->count; ++isam) { qubx_mac_sample *sample = piece->data + isam; qubx_mac_sample_out *sample_out = piece->data_out + isam; double mean = 0.0; double var = 0.0; for (int m=Nmodel-1; m>=0; --m) { ws[m].do_Pt(piece->stimclass, sample->time); mean += ws[m].do_mean_t(mV); if ( p->useVar ) { ws[m].do_var_Pt(); var += ws[m].do_var_t(dV); } } sample_out->fit = (float) mean; double ss = sample->cur - mean; ss *= ss; if ( p->useVar ) { sample_out->fit_var = (float) var; if ( var ) ss = log(var) + ss/var; sample_out->ll = (float) (-0.5 * (ss + log2pi)); } else { sample_out->ll = (float) (-ss); } seg_ll += ss; ++total_points; } piece_time += piece->dur; } for (int m=0; m<Nmodel; ++m) { ws[m].advance_P0(piece->stimclass, piece->dur); } } } logl -= seg_ll; if ( p->useVar ) { logl -= total_points * log2pi; } } } if ( p->useVar ) { logl *= 0.5; } p->ll = logl; if ( stop_flag ) { return -8; // stopped } print_timestamp("end openmp"); return 0; // success } /* as markup on rates.data to remain on card: mac_opencl_data_create(data, model): global: data[Ndata] maybe re-use signal variance as point_ll? global: seg_npiece[Nseg] global: seg_piece_dur[Nseg*maxNpiece] global: seg_piece_sc[Nseg*maxNpiece] // pad to 8 or 16 states? if i>= or j>= return immediately? per run: mac_opencl_create(data, model): global: QQ_eig, QQ_eigenvec, QQ_eiginv [Ns*Ns*Nsc] global: xamp, xvar, amp0, var0, Nchannel global: seg_piece_P0[Nseg*maxNpiece*Ns] global: seg_piece_dV, seg_piece_Ileak fill each seg_piece_P0[seg][0] mac_p0_kernel<1>(Nseg) [i] shared: P0 of seg start each piece[:-1]: [ij] shared: Q_eigenvec[sc], Q_eiginv[sc] [i] shared: eigx = Q_eig[sc] * x ---- [ij]k shared: Pt_i = P0_j * vec_jk * eigx_k * inv_ki ---- [i] {store Pt_i in seg_piece_P0[seg][piece+1] {P0_i = Pt_i ---- nwork(chunksize) = sum(ceil(piece_size / chunksize)) chunksize = first power of 2 | (nwork == nseg) or (nwork < 8*num_multiprocessors) workpieces = [(seg, piece, t0, n)] mac_point_kernel<Ns, Ns>(nwork) determine seg, piece, t-range all: sc [ij] shared: Q_eigenvec[sc], Q_eiginv[sc] [i] shared: xamp, xvar, Q_eig[sc], P0 shared: amp0, var0 ---- each t: [] all: data.x [ij] shared: tmp_ij = x * eig_i * inv_ij ---- [ij]k shared: Pt_i = P0_j * vec_jk * tmp_ki ---- [ij] varPt_ij = (i==j) ? Pt[i] * (1-Pt[i]) : - Pt[i] * Pt[j] [i] tmp_i = varPt_ik * xamp_k []k {data.mean = sum_k(Pt_k * xamp_k) [] {data.mean = data.mean * Nchannel * dV + Ileak + amp0 ---- []k {data.var = sum_k(xamp_k * tmp_k + Pt_k * xvar_k) { * dV { * Nchannel + var0 // all divided by averagecount [] {point_ll = (y - mean)**2 { =? point_ll / data.var + log(data.var) get data from device; sum point_ll data holds context or NULLs: context and whatever compiled kernels non-model-spec. global pointers work-pieces separate qubx_mac_ll_openmp; front qubx_mac_ll tries opencl_8 (auto 16 etc...) then _openmp conditional on opencl flag: implement or auto-fail: qubx_mac_ll_opencl_8: setup data.context etc if NULL setup model-spec. pointers compute seg P0 call P0 kernel call point kernel sum point LL (and seg LLs?) free model-spec. pointers */ #ifdef QUBX_MAC_OPENCL class qubx_mac_workslice { public: int seg, piece, offset, count; qubx_mac_workslice(int seg_, int piece_, int offset_, int count_) : seg(seg_), piece(piece_), offset(offset_), count(count_) {} }; int qubx_mac_ll_opencl(qubx_mac_param *p) { qubx_mac_data *dat = (qubx_mac_data *) p->data; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; qubx_model *model = st->models[0].model; int dim = st->ocl_dim; int kern_ix = ((dim == 4) ? QUBX_4 : ((dim == 8) ? QUBX_8 : QUBX_16)); print_timestamp("begin opencl"); int rtn = 0; if ( (rtn = qubx_mac_opencl_setup_context(dat)) ) return rtn; print_timestamp("have context"); if ( (rtn = qubx_mac_opencl_setup_data(dat)) ) return rtn; print_timestamp("sent data"); if ( (rtn = qubx_mac_opencl_setup_data_V(dat, p)) ) return rtn; print_timestamp("setup data V"); if ( (rtn = qubx_mac_opencl_setup_model(p, dat)) ) return rtn; print_timestamp("setup model"); if ( (rtn = qubx_mac_opencl_setup_P0(dat, p)) ) return rtn; print_timestamp("setup init. P0"); size_t local_dim[2] = {(size_t) dim, (size_t) dim}; size_t global_dim[2] = {(size_t) dim, (size_t) dim}; global_dim[0] = dim*dat->segments.size(); cl_kernel P0_kernel = dat->ocl64_P0_kernel[kern_ix] ? dat->ocl64_P0_kernel[kern_ix] : dat->ocl_P0_kernel[kern_ix]; cl_kernel point_kernel = dat->ocl64_point_kernel[kern_ix] ? dat->ocl64_point_kernel[kern_ix] : dat->ocl_point_kernel[kern_ix]; if ( (rtn = clSetKernelArg(P0_kernel, 0, sizeof(int), & model->Nstate)) ) { cerr << "clSetKernelArg(P0, 0): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 1, sizeof(int), & dat->maxNpiece)) ) { cerr << "clSetKernelArg(P0, 1): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 2, sizeof(int), & dat->Nstim)) ) { cerr << "clSetKernelArg(P0, 2): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 3, sizeof(cl_mem), & st->ocl_QQ_eig)) ) { cerr << "clSetKernelArg(P0, 3): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 4, sizeof(cl_mem), & st->ocl_QQ_evec)) ) { cerr << "clSetKernelArg(P0, 4): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 5, sizeof(cl_mem), & st->ocl_QQ_einv)) ) { cerr << "clSetKernelArg(P0, 5): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 6, sizeof(cl_mem), & dat->ocl_seg_npiece)) ) { cerr << "clSetKernelArg(P0, 6): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 7, sizeof(cl_mem), & dat->ocl_seg_piece_sc)) ) { cerr << "clSetKernelArg(P0, 7): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 8, sizeof(cl_mem), & dat->ocl_seg_piece_dur)) ) { cerr << "clSetKernelArg(P0, 8): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(P0_kernel, 9, sizeof(cl_mem), & dat->ocl_seg_piece_P0)) ) { cerr << "clSetKernelArg(P0, 9): error " << rtn << endl; return rtn; } print_timestamp("start P0 kernel"); if ( (rtn = clEnqueueNDRangeKernel(dat->ocl_queue, P0_kernel, 2, 0, (const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) { cerr << "clEnqueueNDRangeKernel(ocl_P0_kernel): error " << rtn << endl; return rtn; } print_timestamp("end P0 kernel"); global_dim[0] = dim*dat->ocl_slicecount; if ( (rtn = clSetKernelArg(point_kernel, 0, sizeof(int), & model->Nstate)) ) { cerr << "clSetKernelArg(point, 0): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 1, sizeof(int), & dat->maxNpiece)) ) { cerr << "clSetKernelArg(point, 1): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 2, sizeof(int), & dat->Nstim)) ) { cerr << "clSetKernelArg(point, 2): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 3, sizeof(int), & model->Nchannel)) ) { cerr << "clSetKernelArg(point, 3): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 4, sizeof(cl_mem), & st->ocl_QQ_eig)) ) { cerr << "clSetKernelArg(point, 4): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 5, sizeof(cl_mem), & st->ocl_QQ_evec)) ) { cerr << "clSetKernelArg(point, 5): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 6, sizeof(cl_mem), & st->ocl_QQ_einv)) ) { cerr << "clSetKernelArg(point, 6): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 7, sizeof(cl_mem), & dat->ocl_seg_npiece)) ) { cerr << "clSetKernelArg(point, 7): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 8, sizeof(cl_mem), & dat->ocl_seg_piece_sc)) ) { cerr << "clSetKernelArg(point, 8): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 9, sizeof(cl_mem), & dat->ocl_seg_piece_dur)) ) { cerr << "clSetKernelArg(point, 9): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 10, sizeof(cl_mem), & dat->ocl_seg_piece_P0)) ) { cerr << "clSetKernelArg(point, 10): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 11, sizeof(cl_mem), & dat->ocl_seg_piece_dV)) ) { cerr << "clSetKernelArg(point, 11): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 12, sizeof(cl_mem), & dat->ocl_seg_piece_Ileak)) ) { cerr << "clSetKernelArg(point, 12): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 13, sizeof(cl_mem), & st->ocl_xamp)) ) { cerr << "clSetKernelArg(point, 13): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 14, sizeof(cl_mem), & st->ocl_xvar)) ) { cerr << "clSetKernelArg(point, 14): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 15, sizeof(cl_mem), & dat->ocl_workslices)) ) { cerr << "clSetKernelArg(point, 15): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 16, sizeof(cl_mem), & dat->ocl_samples)) ) { cerr << "clSetKernelArg(point, 16): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 17, sizeof(cl_mem), & dat->ocl_samples_out)) ) { cerr << "clSetKernelArg(point, 17): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 18, sizeof(float), & st->ocl_amp0)) ) { cerr << "clSetKernelArg(point, 18): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 19, sizeof(float), & st->ocl_var0)) ) { cerr << "clSetKernelArg(point, 19): error " << rtn << endl; return rtn; } if ( (rtn = clSetKernelArg(point_kernel, 20, sizeof(cl_mem), & dat->ocl_dbg)) ) { cerr << "clSetKernelArg(point, 20): error " << rtn << endl; return rtn; } print_timestamp("start point kernel"); if ( (rtn = clEnqueueNDRangeKernel(dat->ocl_queue, point_kernel, 2, 0, (const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) { cerr << "clEnqueueNDRangeKernel(ocl_point_kernel): error " << rtn << endl; return rtn; } print_timestamp("end point kernel"); /* if ( (rtn = clEnqueueReadBuffer(dat->ocl_queue, dat->ocl_dbg, CL_TRUE, 0, 9*8*sizeof(float), st->transfer, 0, NULL, NULL)) ) { cerr << "clEnqueueReadBuffer(ocl_dbg): error " << rtn << endl; return rtn; } for (int i=0; i<9; ++i) { for (int j=0; j<8; ++j) { cerr << st->transfer[8*i+j] << '\t'; } cerr << endl; } */ double acc = 0.0; if ( (rtn = clEnqueueReadBuffer(dat->ocl_queue, dat->ocl_samples_out, CL_TRUE, 0, dat->Ndata*sizeof(qubx_mac_sample_out), dat->data_out, 0, NULL, NULL)) ) { cerr << "clEnqueueReadBuffer(ocl_samples_out): error " << rtn << endl; return rtn; } print_timestamp("read data"); qubx_mac_sample_out *result = dat->data_out; double log2pi = log(2.0*M_PI); #pragma omp parallel for reduction (+: acc) for (int i=0; i<dat->Ndata; ++i) { double pointSS = result[i].ll; acc += pointSS; result[i].ll = (float) (-0.5 * (pointSS + log2pi)); } print_timestamp("summed ll"); // if not usevar: *ll = - acc; p->ll = - 0.5 * (acc + dat->Ndata * log2pi); return 0; // success } int qubx_mac_opencl_setup_context(qubx_mac_data *data) { if ( data->ocl_context ) return 0; //already done cl_platform_id plat_ids[3]; cl_device_id dev_ids[3]; cl_context_properties props[3]; cl_uint n, ndesc; cl_platform_id p_max = 0; cl_device_id d_max = 0; cl_uint n_max = 0; cl_uint compunits; int errcode = 0; if ( (errcode = clGetPlatformIDs(3, plat_ids, &n)) ) { cerr << "clGetPlatformIDs: error " << errcode << endl; goto RTN_FAIL; } for (cl_uint i = 0; i < n; i++) { if ( (errcode = clGetDeviceIDs(plat_ids[i], CL_DEVICE_TYPE_GPU, 3*sizeof(cl_device_id), dev_ids, &ndesc)) ) { cerr << "clGetDeviceIDs(platform " << i << "): error " << errcode << endl; goto RTN_FAIL; } for (size_t d=0; d<ndesc; ++d) { if ( (errcode = clGetDeviceInfo(dev_ids[d], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &compunits, NULL)) ) { cerr << "clGetDeviceInfo: error " << errcode << endl; goto RTN_FAIL; } //cerr << "Platform " << i << ", device " << d << ": " << compunits << " comp units" << endl; if ( compunits > n_max ) { n_max = compunits; d_max = dev_ids[d]; p_max = plat_ids[i]; } } } data->ocl_device = d_max; data->ocl_compunits = n_max; props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties)p_max; props[2] = 0; //char exts[1000]; //clGetDeviceInfo(data->ocl_device, CL_DEVICE_EXTENSIONS, 1000, exts, NULL); //cerr << "Extensions: " << exts << endl; data->ocl_context = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); if ( errcode ) { cerr << "clCreateContextFromType: error " << errcode << endl; goto RTN_FAIL; } data->ocl_queue = clCreateCommandQueue(data->ocl_context, data->ocl_device, 0, &errcode); if ( errcode ) { cerr << "clCreateCommandQueue: error " << errcode << endl; goto RTN_FAIL; } data->ocl_program = clCreateProgramWithSource(data->ocl_context, 1, &s_mac_kernels, NULL, &errcode); if ( errcode ) { cerr << "clCreateProgramWithSource: error " << errcode << endl; goto RTN_FAIL; } errcode = clBuildProgram(data->ocl_program, 1, &data->ocl_device, 0, 0, 0); if ( errcode ) { cerr << "clBuildProgram: error " << errcode << endl; /* doesn't work: char build_msg[1000]; clGetProgramBuildInfo(data->ocl_program, data->ocl_device, CL_PROGRAM_BUILD_LOG, 1000, build_msg, 0); cerr << build_msg << endl; */ goto RTN_FAIL; } data->ocl_P0_kernel[QUBX_4] = clCreateKernel(data->ocl_program, "mac_P0_kernel_4", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_P0_kernel_4): error " << errcode << endl; goto RTN_FAIL; } data->ocl_point_kernel[QUBX_4] = clCreateKernel(data->ocl_program, "mac_point_kernel_4", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_point_kernel_4): error " << errcode << endl; goto RTN_FAIL; } data->ocl_P0_kernel[QUBX_8] = clCreateKernel(data->ocl_program, "mac_P0_kernel_8", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_P0_kernel_8): error " << errcode << endl; goto RTN_FAIL; } data->ocl_point_kernel[QUBX_8] = clCreateKernel(data->ocl_program, "mac_point_kernel_8", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_point_kernel_8): error " << errcode << endl; goto RTN_FAIL; } data->ocl_P0_kernel[QUBX_16] = clCreateKernel(data->ocl_program, "mac_P0_kernel_16", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_P0_kernel_16): error " << errcode << endl; goto RTN_FAIL; } data->ocl_point_kernel[QUBX_16] = clCreateKernel(data->ocl_program, "mac_point_kernel_16", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac_point_kernel_16): error " << errcode << endl; goto RTN_FAIL; } /* double precision is 3x slower on tesla c1060, comparable to quad-core cpu data->ocl64_program = clCreateProgramWithSource(data->ocl_context, 1, &s_mac64_kernels, NULL, &errcode); if ( errcode ) { cerr << "clCreateProgramWithSource(64): error " << errcode << endl; return 0; // good enough } errcode = clBuildProgram(data->ocl64_program, 1, &data->ocl_device, 0, 0, 0); if ( errcode ) { cerr << "clBuildProgram(64): error " << errcode << endl; char build_msg[1000]; clGetProgramBuildInfo(data->ocl64_program, data->ocl_device, CL_PROGRAM_BUILD_LOG, 1000, build_msg, 0); cerr << build_msg << endl; return 0; // good enough } data->ocl64_P0_kernel[QUBX_4] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_4", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_P0_kernel_4): error " << errcode << endl; return 0; // good enough } data->ocl64_point_kernel[QUBX_4] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_4", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_point_kernel_4): error " << errcode << endl; return 0; // good enough } data->ocl64_P0_kernel[QUBX_8] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_8", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_P0_kernel_8): error " << errcode << endl; return 0; // good enough } data->ocl64_point_kernel[QUBX_8] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_8", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_point_kernel_8): error " << errcode << endl; return 0; // good enough } data->ocl64_P0_kernel[QUBX_16] = clCreateKernel(data->ocl64_program, "mac64_P0_kernel_16", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_P0_kernel_16): error " << errcode << endl; return 0; // good enough } data->ocl64_point_kernel[QUBX_16] = clCreateKernel(data->ocl64_program, "mac64_point_kernel_16", &errcode); if ( errcode ) { cerr << "clCreateKernel(mac64_point_kernel_16): error " << errcode << endl; } */ return 0; RTN_FAIL: qubx_mac_opencl_cleanup_context(data); return errcode; } void qubx_mac_opencl_cleanup_context(qubx_mac_data *data) { for (int i=0; i<3; ++i) { if ( data->ocl_point_kernel[i] ) { clReleaseKernel(data->ocl_point_kernel[i]); data->ocl_point_kernel[i] = NULL; } if ( data->ocl_P0_kernel[i] ) { clReleaseKernel(data->ocl_P0_kernel[i]); data->ocl_P0_kernel[i] = NULL; } if ( data->ocl64_point_kernel[i] ) { clReleaseKernel(data->ocl64_point_kernel[i]); data->ocl64_point_kernel[i] = NULL; } if ( data->ocl64_P0_kernel[i] ) { clReleaseKernel(data->ocl64_P0_kernel[i]); data->ocl64_P0_kernel[i] = NULL; } } if ( data->ocl_program ) { clReleaseProgram(data->ocl_program); data->ocl_program = NULL; } if ( data->ocl64_program ) { clReleaseProgram(data->ocl64_program); data->ocl64_program = NULL; } if ( data->ocl_queue ) { clReleaseCommandQueue(data->ocl_queue); data->ocl_queue = NULL; } if ( data->ocl_context ) { clReleaseContext(data->ocl_context); data->ocl_context = NULL; } } int qubx_mac_opencl_setup_workslices(qubx_mac_data *data) { int chunk = SMALLEST_CHUNK; int count; int total_pieces; int errcode; do { count = 0; total_pieces = 0; for (size_t s=0; s<data->segments.size(); ++s) { for (size_t p=0; p<data->segments[s].pieces.size(); ++p) { if ( data->segments[s].pieces[p].data ) { int pcount = data->segments[s].pieces[p].count; count += (pcount / chunk) + ((pcount % chunk) ? 1 : 0); ++total_pieces; } } } if ( (count == total_pieces) || (count < (int) (OVERLOADING*data->ocl_compunits)) ) break; else chunk *= 2; } while ( 1 ); std::vector<qubx_mac_workslice> slices; for (size_t s=0; s<data->segments.size(); ++s) { for (size_t p=0; p<data->segments[s].pieces.size(); ++p) { qubx_mac_data_piece& piece = data->segments[s].pieces[p]; if ( piece.data ) { int piece_off = (int) (piece.data - data->data); for (int offset=0; offset<piece.count; offset += chunk) { slices.push_back( qubx_mac_workslice((int)s, (int)p, piece_off + offset, min(chunk, piece.count - offset)) ); } } } } data->ocl_workslices = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, slices.size() * sizeof(qubx_mac_workslice), & slices[0], &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_workslices): error " << errcode << endl; return errcode; } data->ocl_slicecount = (int) slices.size(); //cerr << "slice count: " << data->ocl_slicecount << endl; //cerr << "chunk size: " << chunk << endl; return 0; } int qubx_mac_opencl_setup_data(qubx_mac_data *data) { if ( data->ocl_workslices ) return 0; // already set up int errcode; int transfer_bytes = (int) (data->segments.size() * data->maxNpiece * 2 * sizeof(float)); data->transfer = new float[(transfer_bytes / sizeof(float)) + ((transfer_bytes % sizeof(float)) ? 1 : 0)]; float *fbuf = data->transfer; int *ibuf = (int *) fbuf; for (size_t s=0; s<data->segments.size(); ++s) ibuf[s] = (int) data->segments[s].pieces.size(); data->ocl_seg_npiece = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, data->segments.size()*sizeof(int), ibuf, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_npiece): error " << errcode << endl; return errcode; } for (size_t s=0; s<data->segments.size(); ++s) { float *fbuf_s = fbuf + s*data->maxNpiece; for (size_t p=0; p<data->segments[s].pieces.size(); ++p) { fbuf_s[p] = (float) data->segments[s].pieces[p].dur; } } data->ocl_seg_piece_dur = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, data->segments.size()*data->maxNpiece*sizeof(float), fbuf, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_piece_dur): error " << errcode << endl; return errcode; } for (size_t s=0; s<data->segments.size(); ++s) { int *ibuf_s = ibuf + s*data->maxNpiece; for (size_t p=0; p<data->segments[s].pieces.size(); ++p) { ibuf_s[p] = data->segments[s].pieces[p].stimclass; } } data->ocl_seg_piece_sc = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, data->segments.size()*data->maxNpiece*sizeof(int), ibuf, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_piece_sc): error " << errcode << endl; return errcode; } data->ocl_seg_piece_P0 = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE, data->segments.size()*data->maxNpiece*QUBX_OCL_MAX_NSTATE*sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_piece_P0): error " << errcode << endl; return errcode; } data->ocl_seg_piece_dV = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE, data->segments.size()*data->maxNpiece*sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_piece_dV): error " << errcode << endl; return errcode; } data->ocl_seg_piece_Ileak = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE, data->segments.size()*data->maxNpiece*sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_seg_piece_Ileak): error " << errcode << endl; return errcode; } data->ocl_samples = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, data->Ndata*sizeof(qubx_mac_sample), data->data, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_samples): error " << errcode << endl; return errcode; } data->ocl_samples_out = clCreateBuffer(data->ocl_context, CL_MEM_WRITE_ONLY, data->Ndata*sizeof(qubx_mac_sample_out), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_samples_out): error " << errcode << endl; return errcode; } data->ocl_dbg = clCreateBuffer(data->ocl_context, CL_MEM_READ_WRITE, QUBX_OCL_MAX_NSTATE*QUBX_OCL_MAX_NSTATE*sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_dbg): error " << errcode << endl; return errcode; } return qubx_mac_opencl_setup_workslices(data); } void qubx_mac_opencl_cleanup_data(qubx_mac_data *data) { clReleaseMemObject(data->ocl_workslices); data->ocl_workslices = NULL; clReleaseMemObject(data->ocl_dbg); data->ocl_dbg = NULL; clReleaseMemObject(data->ocl_samples_out); data->ocl_samples_out = NULL; clReleaseMemObject(data->ocl_samples); data->ocl_samples = NULL; clReleaseMemObject(data->ocl_seg_piece_P0); data->ocl_seg_piece_P0 = NULL; clReleaseMemObject(data->ocl_seg_piece_dV); data->ocl_seg_piece_dV = NULL; clReleaseMemObject(data->ocl_seg_piece_Ileak); data->ocl_seg_piece_Ileak = NULL; clReleaseMemObject(data->ocl_seg_piece_sc); data->ocl_seg_piece_sc = NULL; clReleaseMemObject(data->ocl_seg_piece_dur); data->ocl_seg_piece_dur = NULL; clReleaseMemObject(data->ocl_seg_npiece); data->ocl_seg_npiece = NULL; delete [] data->transfer; data->sent_v = 0; } int qubx_mac_opencl_setup_model(qubx_mac_param *p, qubx_mac_data *data) { int errcode; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; qubx_model *model = st->models[0].model; qubx_stim_rates *ratesm = st->models[0].ratesm; int dim = st->ocl_dim; if ( ! st->ocl_xvar ) { int transfer_bytes = max((int) (QUBX_OCL_MAX_NSTATE * QUBX_OCL_MAX_NSTATE * 2 * data->Nstim * sizeof(float)), (int) (data->segments.size() * data->maxNpiece * sizeof(float))); st->transfer = new float[(transfer_bytes / sizeof(float)) + ((transfer_bytes % sizeof(float)) ? 1 : 0)]; st->ocl_QQ_eig = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY, dim * data->Nstim * sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_QQ_eig): error " << errcode << endl; return errcode; } st->ocl_QQ_evec = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY, dim * dim * data->Nstim * sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_QQ_evec): error " << errcode << endl; return errcode; } st->ocl_QQ_einv = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY, dim * dim * data->Nstim * sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_QQ_einv): error " << errcode << endl; return errcode; } st->ocl_xamp = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY, dim * sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_xamp): error " << errcode << endl; return errcode; } st->ocl_xvar = clCreateBuffer(data->ocl_context, CL_MEM_READ_ONLY, dim * sizeof(float), NULL, &errcode); if ( errcode ) { cerr << "clCreateBuffer(ocl_xvar): error " << errcode << endl; return errcode; } } float *fbuf = st->transfer; float *fbuf_row; fbuf_row = fbuf; for (int sc=0; sc<data->Nstim; ++sc, fbuf_row += dim) { int i = 0; for (; i<model->Nstate; ++i) fbuf_row[i] = (float) ratesm->QQ_eig[sc][i]; for (; i<dim; ++i) fbuf_row[i] = 0.0f; } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_eig, CL_TRUE, 0, data->Nstim*dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_QQ_eig): error " << errcode << endl; return errcode; } fbuf_row = fbuf; for (int sc=0; sc<data->Nstim; ++sc) { int r = 0; for (; r<model->Nstate; ++r, fbuf_row += dim) { int c = 0; for (; c<model->Nstate; ++c) fbuf_row[c] = (float) ratesm->QQ_evec[sc][r][c]; for (; c<dim; ++c) fbuf_row[c] = 0.0f; } for (; r<dim; ++r, fbuf_row += dim) memset(fbuf_row, 0, dim*sizeof(float)); } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_evec, CL_FALSE, 0, data->Nstim*dim*dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_QQ_evec): error " << errcode << endl; return errcode; } // fbuf_row = fbuf; for (int sc=0; sc<data->Nstim; ++sc) { int r = 0; for (; r<model->Nstate; ++r, fbuf_row += dim) { int c = 0; for (; c<model->Nstate; ++c) fbuf_row[c] = (float) ratesm->QQ_einv[sc][r][c]; for (; c<dim; ++c) fbuf_row[c] = 0.0f; } for (; r<dim; ++r, fbuf_row += dim) memset(fbuf_row, 0, dim*sizeof(float)); if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_QQ_einv, CL_TRUE, 0, data->Nstim*dim*dim*sizeof(float), fbuf+data->Nstim*dim*dim, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_QQ_einv): error " << errcode << endl; return errcode; } } memset(fbuf, 0, 16*sizeof(float)); for (int i=0; i<model->Nstate; ++i) { fbuf[i] = (float) st->models[0].xamp.p[i]; fbuf[dim+i] = (float) st->models[0].xvar.p[i]; } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_xamp, CL_FALSE, 0, dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_xamp): error " << errcode << endl; return errcode; } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, st->ocl_xvar, CL_TRUE, 0, dim*sizeof(float), fbuf+dim, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_xvar): error " << errcode << endl; return errcode; } st->ocl_amp0 = (float) model->amp0; st->ocl_var0 = (float) model->var0; return 0; } void qubx_mac_opencl_cleanup_model(qubx_mac_param_storage *st) { clReleaseMemObject(st->ocl_xvar); st->ocl_xvar = NULL; clReleaseMemObject(st->ocl_xamp); st->ocl_xamp = NULL; clReleaseMemObject(st->ocl_QQ_eig); st->ocl_QQ_eig = NULL; clReleaseMemObject(st->ocl_QQ_evec); st->ocl_QQ_evec = NULL; clReleaseMemObject(st->ocl_QQ_einv); st->ocl_QQ_einv = NULL; delete [] st->transfer; } int qubx_mac_opencl_setup_data_V(qubx_mac_data *data, qubx_mac_param *p) { if ( data->sent_v ) return 0; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; qubx_model *model = st->models[0].model; qubx_stim_amps *ampm = st->models[0].ampm; float Vrev = (float) (1e-3 * model->Vrev); float Vleak = (float) (1e-3 * model->Vleak); float Gleak = (float) model->Gleak; float *fbuf = data->transfer; float *fbuf2 = data->transfer + data->segments.size() * data->maxNpiece; int errcode = 0; if ( model->IeqFv ) { for (size_t s=0; s<data->segments.size(); ++s) { float *fbuf_s = fbuf + s * data->maxNpiece; float *fbuf2_s = fbuf2 + s * data->maxNpiece; for (size_t ip=0; ip<data->segments[s].pieces.size(); ++ip) { float V = (float) (1e-3 * ampm->stimclasses[data->segments[s].pieces[ip].stimclass * ampm->Nsig + model->iVoltage]); fbuf_s[ip] = V - Vrev; fbuf2_s[ip] = (V - Vleak) * Gleak; } } } else { memset(fbuf, 0, data->segments.size() * data->maxNpiece * 2 * sizeof(float)); for (size_t s=0; s<data->segments.size(); ++s) { float *fbuf_s = fbuf + s * data->maxNpiece; for (size_t ip=0; ip<data->segments[s].pieces.size(); ++ip) fbuf_s[ip] = 1.0; } } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_dV, CL_FALSE, 0, data->segments.size() * data->maxNpiece * sizeof(float), fbuf, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_seg_piece_dV): error " << errcode << endl; return errcode; } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_Ileak, CL_TRUE, 0, data->segments.size() * data->maxNpiece * sizeof(float), fbuf2, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_seg_piece_Ileak): error " << errcode << endl; return errcode; } data->sent_v = 1; return 0; } int qubx_mac_opencl_setup_P0(qubx_mac_data *data, qubx_mac_param *p) { int errcode; qubx_mac_param_storage *st = (qubx_mac_param_storage *) p->storage; qubx_model *model = st->models[0].model; qubx_stim_rates *ratesm = st->models[0].ratesm; qubx_stim_rates_storage *ratesm_st = (qubx_stim_rates_storage *) ratesm->storage; matrix<double> P0(1, model->Nstate); int dim = st->ocl_dim; // each seg: find P0 (equil?), put in model->transfer, write to data->ocl_seg_piece_P0 +s*maxNpiece*dim*float if ( ! model->usePeq ) { memset(st->transfer, 0, dim*sizeof(float)); for (int i=0; i<model->Nstate; ++i) st->transfer[i] = (float) model->P0[i]; } for (size_t s=0; s<data->segments.size(); ++s) { if ( model->usePeq ) { memset(st->transfer, 0, dim*sizeof(float)); QtoPe(ratesm_st->QQ[data->segments[s].pieces[0].stimclass].m, P0); for (int i=0; i<model->Nstate; ++i) st->transfer[i] = (float) P0(0,i); } if ( (errcode = clEnqueueWriteBuffer(data->ocl_queue, data->ocl_seg_piece_P0, CL_TRUE, s*data->maxNpiece*dim*sizeof(float), dim*sizeof(float), st->transfer, 0, NULL, NULL)) ) { cerr << "clEnqueueWriteBuffer(ocl_seg_piece_P0): error " << errcode << endl; return errcode; } } return 0; } #endif // TODO: video, email