max_subi_ll.cpp.html mathcode2html   
 Source file:   max_subi_ll.cpp
 Converted:   Fri Dec 15 2017 at 14:08:19
 This documentation file will not reflect any later changes in the source file.

$$\phantom{******** If you see this on the webpage then the browser could not locate *********}$$
$$\phantom{******** jsMath/easy/load.js or the variable root is set wrong in this file *********}$$
$$\newcommand{\vector}[1]{\left[\begin{array}{c} #1 \end{array}\right]}$$ $$\newenvironment{matrix}{\left[\begin{array}{cccccccccc}} {\end{array}\right]}$$ $$\newcommand{\A}{{\cal A}}$$ $$\newcommand{\W}{{\cal W}}$$

/* Copyright 2008-2014 Research Foundation State University of New York   */

/* This file is part of QUB Express.                                      */

/* QUB Express is free software; you can redistribute it and/or modify    */
/* it under the terms of the GNU General Public License as published by   */
/* the Free Software Foundation, either version 3 of the License, or      */
/* (at your option) any later version.                                    */

/* QUB Express is distributed in the hope that it will be useful,         */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of         */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          */
/* GNU General Public License for more details.                           */

/* You should have received a copy of the GNU General Public License,     */
/* named LICENSE.txt, in the QUB Express program directory.  If not, see  */
/* <http://www.gnu.org/licenses/>.                                        */

/*

See also:

Up: Index

*/

#ifdef _WIN32
  #include <windows.h>
  #include <float.h>
  #define finite _finite
#else
  #include <stdlib.h>
  #define BOOL int
  #define TRUE 1
  #define FALSE 0
#endif

#include <string.h>
#include <iostream>
#include <fstream>

#define _USE_MATH_DEFINES
#include <math.h>
#include <string.h>
#include <set>
#include <map>
#include <vector>
#include <algorithm>

#ifdef QUBX_MAC_OPENCL
  #ifdef __APPLE__ // mac os x:
    #include <OpenCL/opencl.h>
  #else // linux/nvidia:
    #include <CL/cl.h>
  #endif
#endif

#include "max_subi_ll.h"
#include "max_inter_ll.h"
#include "../qubfast/max_ll_util.h"
#include "../qubfast/ublas_matrixutil.h"
#include "../qubfast/callbk_reportstream.h"

using std::cout;
using std::cerr;
using std::endl;
using std::set;
using std::map;
using std::pair;
using std::copy;
using std::max;

#ifdef _WIN32
  #include "../qubfast/gettimeofday.h"
#else
  #include <sys/time.h>
#endif
void msl_print_timestamp(MAXILL_VAR_NOT_USED
			 const char *lbl)
{
  /*
  struct timeval tm;
  gettimeofday(&tm, NULL);
  cerr << "--- " << tm.tv_sec << " : " << (tm.tv_usec / 1000) << "." << (tm.tv_usec % 1000) << " --- " << lbl << endl;
  */
}

int msl_print_timestamp_from_stream(const char *msg, MAXILL_VAR_NOT_USED void *data)
{
  msl_print_timestamp(msg);
  return 0;
}

callbk_reportstream ptime(msl_print_timestamp_from_stream, NULL);

#define MSL_SUPER_EIGEN_SSD_FAIL 0.1

double msl_super_eigen_test(matrix<double>& Q, double *eig_re, MAXILL_VAR_NOT_USED double *eig_im, matrix<double>& vecs, matrix<double>& invecs)
{
  int N = (int) Q.size1();
  double ssd = 0.0;
  double Qmax = 1e-3;
  matrix<double> Ulambda(N, N), Qtest(N, N);
  for (int i=N-1; i>=0; --i)
    for (int j=N-1; j>=0; --j)
      Ulambda(i,j) = vecs(i,j) * eig_re[j];
  noalias(Qtest) = prod(Ulambda, invecs);
  for (int i=N-1; i>=0; --i) {
    Qmax = max(Qmax, -Q(i,i));
    for (int j=N-1; j>=0; --j) {
      double diff = Qtest(i,j) - Q(i,j);
      ssd += diff*diff;
    }
  }
  //if ( ((ssd / (N*N)) / (Qmax*Qmax)) >= MSL_SUPER_EIGEN_SSD_FAIL )
  //  cerr << "Qtest: " << Qtest << endl;
  return ( (ssd / (N*N)) / (Qmax*Qmax) );
}

int msl_super_eigen(matrix<double> &Q, double *eig_re, double *eig_im, matrix<double>& vecs, matrix<double>& invecs, eigen_func custom_eigen, std::ostream& output)
{
  int err = ! eigen_vals_vecs(Q, eig_re, eig_im, vecs, invecs, output);
  double ssd = 0.0;
  if ( ! err ) {
    ssd = msl_super_eigen_test(Q, eig_re, eig_im, vecs, invecs);
    err = (ssd >= MSL_SUPER_EIGEN_SSD_FAIL) ? -100 : 0;
    //if ( err )
    //  cerr << "builtin eigen fail: " << ssd << endl;
  }
  //else
  //  cerr << "builtin eigen err: " << err << endl;
  if ( err && custom_eigen ) {
    err = custom_eigen((int) Q.size1(), & Q(0,0), eig_re, eig_im, & vecs(0,0), & invecs(0,0));
    if ( ! err ) {
      ssd = msl_super_eigen_test(Q, eig_re, eig_im, vecs, invecs);
      err = (ssd >= MSL_SUPER_EIGEN_SSD_FAIL) ? -101 : 0;
    }
    //if ( err ) {
    //  cerr << "err in fallback eigen; err: " << err << "  (ssd/element)/Qmax: " << ssd << endl;
    //  cerr << Q << endl;
    //}
  }
  return err;
}

extern "C" MAXILL_API void msl_multiplex_bounds(int Nseg, int Nsignal,
							  int **dwellCountses, int ***classeseses,
							  MAXILL_VAR_NOT_USED float ***durationseses, MAXILL_VAR_NOT_USED double ***ampseses,
							  int *out_dwellCounts, int *out_Nplex, int *out_Nstim)
{
  // First we need to know Nc[s] -- max number of classes in signal s
  typedef std::vector< set<int> > setlist;
  setlist classets(Nsignal);
  for (int seg=0; seg<Nseg; ++seg) {
    int *dwellCounts = dwellCountses[seg];
    int **classeses = classeseses[seg];
    for (int sig=0; sig<Nsignal; ++sig) {
      set<int>& classet = classets[sig];
      int *classes_end = classeses[sig]+dwellCounts[sig];
      for (int *cl=classeses[sig]; cl != classes_end; ++cl)
	classet.insert(*cl);
    }
  }
  std::vector<int> Nc;
  for (setlist::iterator csi=classets.begin(); csi != classets.end(); ++csi)
    Nc.push_back((int) csi->size());

  // If every combination of stimuli actually occurs, there will be
  //   Nc[1] * Nc[2] * ... stimclasses
  *out_Nstim = 1;
  for (int i=1; i<Nsignal; ++i)
    *out_Nstim *= Nc[i];
  
  // If every combination of stimclass and Markov-class occurs, there will be
  //   Nstim * Nc[0] plexiclasses
  *out_Nplex = *out_Nstim * Nc[0];
  
  // If none of the transitions on any signals coincide in time, there will be fewer than
  //   dwellCounts[0] + dwellCounts[1] + ... events in a segment
  for (int seg=0; seg<Nseg; ++seg) {
    out_dwellCounts[seg] = 0;
    for (int sig=0; sig<Nsignal; ++sig)
      out_dwellCounts[seg] += dwellCountses[seg][sig];
  }
}


typedef pair<int,int> plex;
typedef map<plex, int> plexmap;
typedef map<std::vector<double>, int> stimmap;

void msl_multiplex_segment(int Nsignal, int *dwellCounts, int **classeses, float **durationses,
			   double **ampses, int *out_dwellCount, int *out_classes, float *out_durations,
			   plexmap& plexcls_of, stimmap& stimcls_of)
{
  if ( ! Nsignal )
    return;
  // the key to stimmap
  std::vector<double> amps(Nsignal);
  // output
  int &Nd = *out_dwellCount;
  Nd = 0;
  // per-signal stuff
  std::vector<int> finger(Nsignal, 0);
  std::vector<int> clss(Nsignal);
  std::vector<float> remain(Nsignal);
  for (int i=0; i<Nsignal; ++i) {
    if ( ! dwellCounts[i] )
      return;
    clss[i] = classeses[i][0];
    remain[i] = durationses[i][0];
    if ( i ) { // the Markov class isn't part of stimclass
      if ( clss[i] >= 0 )
	amps[i] = ampses[i][ clss[i] ];
      else
	amps[i] = 0.0; // gap at beginning of stimulus: should complain and fail; instead silently defaulting to zero
    }
  }
  while ( 1 ) {
    // until any signal ends
    bool done = false;
    for (int i=0; i<Nsignal; ++i)
      if ( finger[i] >= dwellCounts[i] )
	done = true;
    if ( done )
      break;
    // find signal with shortest remaining
    float tm = remain[0];
    //int changing = 0;
    for (int i=0; i<Nsignal; ++i) {
      if ( remain[i] < tm ) {
	tm = remain[i];
	//changing = i;
      }
    }
    // look up stim- and plexi-classes
    int sc, pc;
    stimmap::iterator sti = stimcls_of.find(amps);
    if ( sti == stimcls_of.end() ) {
      sc = (int) stimcls_of.size();
      stimcls_of[amps] = sc;
    } else {
      sc = stimcls_of[amps];
    }
    plex plx(clss[0], sc);
    plexmap::iterator pli = plexcls_of.find(plx);
    if ( pli == plexcls_of.end() ) {
      pc = (int) plexcls_of.size();
      plexcls_of[plx] = pc;
    } else {
      pc = plexcls_of[plx];
    }
    // append the event
    out_classes[Nd] = pc;
    out_durations[Nd] = tm;
    ++Nd;
    // shorten all signals remain, and move the finger for any that are 0
    for (int i=0; i<Nsignal; ++i) {
      remain[i] -= tm;
      if ( remain[i] <= 0.0 ) {
	if ( (++finger[i]) < dwellCounts[i] ) {
	  remain[i] = durationses[i][ finger[i] ];
	  clss[i] = classeses[i][ finger[i] ];
	  if ( i )
	    if ( clss[i] >= 0 )
	      amps[i] = ampses[i][ clss[i] ];
	    // else: gap: keep the last event's amp
	}
      }
    }
  }
}

extern "C" MAXILL_API void msl_multiplex(int Nseg, int Nsignal,
						   int **dwellCountses, int ***classeseses,
						   float ***durationseses, double ***ampseses,
						   double tdead_ms, int *out_dwellCounts,
						   int **out_classeses, float **out_durationses,
						   int *out_Nplex, int *out_plexiclasses,
						   int *out_Nstim, double *out_stimclasses)
{
  // We use the same plexi- and stim-classes for all segments, numbered in order of appearance.
  // We'll collect the indices in these maps
  plexmap plexcls_of;
  stimmap stimcls_of;
  
  for (int seg=0; seg<Nseg; ++seg)
    msl_multiplex_segment(Nsignal, dwellCountses[seg], classeseses[seg], durationseses[seg],
			  ampseses[seg], out_dwellCounts+seg, out_classeses[seg],
			  out_durationses[seg], plexcls_of, stimcls_of);

  // apply dead time
  mil_prep_segments(Nseg, out_dwellCounts, out_classeses, out_durationses, tdead_ms);

  // and reverse s- and p-classes out
  *out_Nplex = (int) plexcls_of.size();
  for (plexmap::iterator plexi = plexcls_of.begin(); plexi != plexcls_of.end(); ++plexi) {
    int i = plexi->second * 2;
    out_plexiclasses[i] = plexi->first.first;
    out_plexiclasses[i+1] = plexi->first.second;
  }
  *out_Nstim = 0;
  for (stimmap::iterator stimi = stimcls_of.begin(); stimi != stimcls_of.end(); ++stimi) {
    int i = stimi->second * Nsignal;
    copy(stimi->first.begin(), stimi->first.end(), out_stimclasses + i);
	++(*out_Nstim);
  }
}



extern "C" MAXILL_API int subi_ll_arr(
				      int Ns, int *clazz, double *P0_in,  double *K0_in, double *K1_in, double *K2_in,
	int *Ligand_in, int *Voltage_in, int *Pressure_in,
	double tdead_sec, int Nseg, int *dwellCounts, int **classeses, float **durationses,
	int Nsig, int Nplex, int *plexiclasses, int Nstim, double *stimclasses,
	double *ll, eigen_func custom_eigen)
{
  std::vector<double*> K0, K1, K2;
  std::vector<int*> Lig, Vol, Pres;
  for (int i=0; i<Ns; ++i) {
    K0.push_back( K0_in + i*Ns );
    K1.push_back( K1_in + i*Ns );
    K2.push_back( K2_in + i*Ns );
    Lig.push_back( Ligand_in + i*Ns );
    Vol.push_back( Voltage_in + i*Ns );
    Pres.push_back( Pressure_in + i*Ns );
  }
  std::vector<double> segLL(Nseg);

  return subi_ll(Ns, clazz, P0_in, &(K0[0]), &(K1[0]), &(K2[0]), &(Lig[0]), &(Vol[0]), &(Pres[0]),
		 tdead_sec, Nseg, dwellCounts, classeses, durationses,
		 Nsig, Nplex, plexiclasses, Nstim, stimclasses, ll, custom_eigen, &(segLL[0]), NULL, NULL, NULL);
}


extern "C" MAXILL_API int subi_ll(
	int Ns, int *clazz, double *P0_in,  double **K0_in, double **K1_in, double **K2_in,
	int **Ligand_in, int **Voltage_in, int **Pressure_in,
	double tdead_sec, int Nseg, int *dwellCounts, int **classeses, float **durationses,
	int Nsig, int Nplex, int *plexiclasses, int Nstim, double *stimclasses,
	double *ll, eigen_func custom_eigen, double *segLL, callbk_reportfun report_cb, void *report_data, int *stop_flag)
{
  //msl_print_timestamp("---subi_ll begin");
  callbk_reportstream output(report_cb, report_data);

  // find max class in model and data -- make sure it's possible
  int maxcls = clazz[0];
  for (int i=1; i<Ns; ++i)
    if ( clazz[i] > maxcls )
      maxcls = clazz[i];
  
  int maxcls_data = -1;
  for (int i=0; i<Nplex; ++i) {
    if ( plexiclasses[2*i] > maxcls_data )
      maxcls_data = plexiclasses[2*i];
  }
  if ( maxcls_data > maxcls ) {
    output << "MSL: Not enough classes in the model." << endl;
    return -1;
  }
  
  // list the states that are in each class, and the states which are not:
  std::vector< std::vector<int> > clazz_states(maxcls+1);
  std::vector< std::vector<int> > clazz_oppos(maxcls+1);
  for (int i=0; i<Ns; ++i) {
    clazz_states[clazz[i]].push_back(i);
    for (int j=0; j<Ns; ++j)
      if ( clazz[j] != clazz[i] )
	clazz_oppos[clazz[i]].push_back(j);
  }

  // copy inputs into ublas objects
  matrix<double> K0;         matrix_from_arrays(K0, K0_in, Ns, Ns);
  matrix<double> K1;         matrix_from_arrays(K1, K1_in, Ns, Ns);
  matrix<double> K2;         matrix_from_arrays(K2, K2_in, Ns, Ns);
  matrix<int>    Ligand;     matrix_from_arrays(Ligand, Ligand_in, Ns, Ns);
  matrix<int>    Voltage;    matrix_from_arrays(Voltage, Voltage_in, Ns, Ns);
  matrix<int>    Pressure;   matrix_from_arrays(Pressure, Pressure_in, Ns, Ns);

  // set up P0, eQ, eigen-decomposition of submatrices eQaa, and A(tdead)
  // and temporary matrices for intermediate calculations
  
  // normalize fixed P0 (if any) to sum to 1
  matrix<double> P0(1, Ns);
  if ( P0_in ) 
  {
    matrix_from_array(P0, P0_in, 1, Ns);
    double sum = 0.0;
    for (int i=0; i<Ns; ++i)
      sum += P0_in[i];
    for (int i=0; i<Ns; ++i)
      P0(0,i) = (sum > 1e-8 ) ? (P0(0,i) / sum) : (1.0 / Ns);
  }
  int            Na;

  // build dead-time-corrected Q matrices, different in each stimclass
  std::vector< matrix<double> > QQ(Nstim), eQQ(Nstim);
  for (int sc=0; sc<Nstim; ++sc) {
    BuildQ_p(Ns, QQ[sc], K0, K1, K2, Ligand, Voltage, Pressure, stimclasses + sc*Nsig);
    QtoQe(QQ[sc], clazz, tdead_sec, eQQ[sc]);
  }
  
  // we eigendecompose each eQ for quick exponentiation in the inner loop: (gap A matrix)
  // indices are [stimclass][(vector or matrix)]
  std::vector< std::vector<double> > eigvals_eq(Nstim); // (maxcls+1);
  std::vector< matrix<double> > eigvecs_eq(Nstim);
  std::vector< matrix<double> > eiginvs_eq(Nstim);
  vector<double> eig_imag(Ns);
  for (int sc=0; sc<Nstim; ++sc) {
    if ( stop_flag && *stop_flag ) return -33;
    eigvals_eq[sc].resize(Ns);
    eigvecs_eq[sc].resize(Ns, Ns);
    eiginvs_eq[sc].resize(Ns, Ns);
    if ( msl_super_eigen(eQQ[sc], &(eigvals_eq[sc][0]), &(eig_imag[0]), eigvecs_eq[sc], eiginvs_eq[sc], custom_eigen, output) ) {
      output << "No convergence in eigen calculations for stimclass " << sc << endl;
      output << eQQ[sc] << endl;
      return -2;
    }
    for (int i=0; i<Ns; ++i) {
      if ( fabs(eig_imag[i]) > 1e-5 ) {
	output << "Complex eigenvalues found: " << eig_imag[i] << " for stimclass " << sc << endl;
	output << eQQ[sc] << endl;
	return -2;
      }
      if ( eigvals_eq[sc][i] > 1e-5 ) {
	output << "Positive eigenvalues found: " << eigvals_eq[sc][i] << " for stimclass " << sc << endl;
	output << eQQ[sc] << endl;
	return -2;
      }
    }
  }
  // we eigendecompose each submatrix eQaa for quick exponentiation in the inner loop:
  // indices are [stimclass][model class][(vector or matrix)]
  std::vector< std::vector< std::vector<double> > > eigvals_aa(Nstim); // (maxcls+1);
  std::vector< std::vector< matrix<double> > > eigvecs_aa(Nstim);
  std::vector< std::vector< matrix<double> > > eiginvs_aa(Nstim);
  for (int sc=0; sc<Nstim; ++sc) {
    if ( stop_flag && *stop_flag ) return -33;
    eigvals_aa[sc].resize(maxcls+1);
    eigvecs_aa[sc].resize(maxcls+1);
    eiginvs_aa[sc].resize(maxcls+1);
    for (int a=0; a<=maxcls; ++a) {
      Na = (int) clazz_states[a].size();
      if ( Na ) {
	int *aa = &(clazz_states[a][0]);
	matrix<double> eQaa;   matrix_to_partition(eQQ[sc], Na, Na, aa, aa, eQaa);
	eigvals_aa[sc][a].resize(Na);
	eigvecs_aa[sc][a].resize(Na, Na);
	eiginvs_aa[sc][a].resize(Na, Na);
	if ( msl_super_eigen(eQaa, &(eigvals_aa[sc][a][0]), &(eig_imag[0]), eigvecs_aa[sc][a], eiginvs_aa[sc][a], custom_eigen, output) ) {
	  output << "No convergence in eigen calculations for eQaa class " << a << endl;
	  output << eQaa << endl;
	  return -2;
	}
	for (int i=0; i<Na; ++i) {
	  if ( fabs(eig_imag[i]) > 1e-5 ) {
	    output << "Complex eigenvalues found: " << eig_imag[i] << " for eQaa class " << a << endl;
	    output << eQaa << endl;
	    return -2;
	  }
	  if ( eigvals_aa[sc][a][i] > 1e-5 ) {
	    output << "Positive eigenvalues found: " << eigvals_aa[sc][a][i] << " for eQaa class " << a << endl;
	    output << eQaa << endl;
	    return -2;
	  }
	}
      }
    }
  }
  // Then for efficiency, allocate space for expm_eigen temps
  std::vector< matrix<double> > tmp_nxn(Ns+1), tmp_nxn_diag(Ns+1);
  for (int i=1; i<=Ns; ++i) {
    tmp_nxn[i].resize(i, i);
    tmp_nxn_diag[i].resize(i, i);
    for (int j=0; j<i; ++j)
      for (int k=0; k<i; ++k)
	tmp_nxn_diag[i](j,k) = 0.0;
  }
  // and a place for the final exponentiated matrices:
  std::vector< matrix<double> > eQ_aa(maxcls+1);
  for (int a=0; a<=maxcls; ++a) {
    Na = (int) clazz_states[a].size();
    if ( Na )
      eQ_aa[a].resize(Na, Na);
  }
  // which get copied out to the full Ns x Ns
  matrix<double> eQ_aa_full(Ns, Ns);
  
  // each stimclass has Atd = exp(Q * tdead)
  std::vector< matrix<double> > AAtd(Nstim);
  for (int sc=0; sc<Nstim; ++sc) {
    if ( stop_flag && *stop_flag ) return -33;
	AAtd[sc].resize(Ns, Ns);
    expm(QQ[sc], tdead_sec, AAtd[sc]);
  }
  // for a given destination class a, we zero out columns belonging to other classes
  // (probability of ending outside a is 0, given the data)
  std::vector< std::vector< matrix<double> > > AAtd_to(Nstim); // [sc][a]
  for (int sc=0; sc<Nstim; ++sc) {
    matrix<double>& Atd = AAtd[sc];
    AAtd_to[sc].resize(maxcls+1);
    for (int a=0; a<=maxcls; ++a) {
      matrix<double>& Atd_a = AAtd_to[sc][a];
      std::vector<int>::iterator other = clazz_oppos[a].begin();
      std::vector<int>::iterator other_end = clazz_oppos[a].end();
      Atd_a.resize(Ns, Ns);
      for (int j=0; j<Ns; ++j) {
	if ( (other != other_end) && (*other == j) ) {
	  ++other;
	  for (int i=0; i<Ns; ++i)
	    Atd_a(i,j) = 0.0;
	}
	else {
	  for (int i=0; i<Ns; ++i)
	    Atd_a(i,j) = Atd(i,j);
	}
      }
    }
  }
  // scratch space for custom final Atd to "any class but a"
  matrix<double> Atd_final(Ns, Ns);

  // alternating temporary alpha rows per class to avoid temporaries alpha_a <- alpha_a * At
  vector<double> alpha_pre(Ns), alpha_post(Ns);

  // scale accumulator
  *ll = 0.0;
  
  // loop over segments, summing ll
  for (int iseg = 0; iseg<Nseg; ++iseg) {
    if ( stop_flag && *stop_flag ) return -33;
    int Nd = dwellCounts[iseg];
    int *classes = classeses[iseg];
    float *durations = durationses[iseg];
    int pc = classes[0];
    int a = plexiclasses[2*pc];
    int sc = plexiclasses[2*pc+1];
    if ( a >= 0 )
      Na = (int) clazz_states[a].size();
    else // gap
      Na = Ns;
    int b = a;
    int Nb = Na;
    double &seg_ll = segLL[iseg];
    seg_ll = 0.0;
    
    // If they didn't give fixed P0, start at equilibrium for the starting stimulus
    if ( ! P0_in ) {
      QtoPe(eQQ[sc], P0);
    }
    
    // P0 are the entry probabilities, only in states of class a
    if ( a >= 0 ) { 
      for (int i=0; i<Ns; ++i) {
	alpha_pre(i) = (clazz[i] == a) ? P0(0, i) : 0.0;
      }
    } else { // gap: all states possible
      for (int i=0; i<Ns; ++i) {
	alpha_pre(i) = P0(0, i);
      }
    }
    // their contribution to ll is log( sum( alpha_pre ) )
    // and alpha_pre is normalized to sum to 1
    seg_ll += alpha_logscale(alpha_pre);
    
    // loop over all but the final dwell
    for (int d=1; d<Nd; ++d) {
      a = b;                      // next dwell, in class a
      Na = Nb;
      
      // check the user stop flag every so often
      if ( (! (d%512)) && stop_flag && *stop_flag ) return -33;

      /*
in original MIL, scaling is done separately for the dwell and subsequent transition, and all alphas are kept for the gradient calculation. Here we accumulate and scale both together.
note that dur has already had tdead subtracted
alpha <- alpha \(* e^{Q_{aa} * dur} * A_{td}\)[b]
      */
      
      if ( a >= 0 ) {
	expm_eig(&(eigvals_aa[sc][a][0]), eigvecs_aa[sc][a], eiginvs_aa[sc][a], durations[d-1],
		 tmp_nxn_diag[Na], tmp_nxn[Na], eQ_aa[a]);
	eQ_aa_full.clear();
	matrix_from_partition(eQ_aa_full, Na, Na, &(clazz_states[a][0]), &(clazz_states[a][0]), eQ_aa[a]);
      } else { // gap: full exp(eQ)
	expm_eig(&(eigvals_eq[sc][0]), eigvecs_eq[sc], eiginvs_eq[sc], durations[d-1],
		 tmp_nxn_diag[Ns], tmp_nxn[Ns], eQ_aa_full);
      }
      noalias(alpha_post) = prod(alpha_pre, eQ_aa_full);
      // seg_ll += alpha_logscale(alpha_post);

      // switch sc after the dwell, before the transition
      pc = classes[d];
      b = plexiclasses[2*pc];    // going to class b
      sc = plexiclasses[2*pc+1];
      if ( b >= 0 )
	Nb = (int) clazz_states[b].size();
      else // into gap
	Nb = Ns;
      
      if ( b >= 0 )
	noalias(alpha_pre) = prod(alpha_post, AAtd_to[sc][b]);
      else // into gap: all states possible
	noalias(alpha_pre) = prod(alpha_post, AAtd[sc]);
      seg_ll += alpha_logscale(alpha_pre);
    }
    // final dwell, in class a
    a = b;
    Na = Nb;
    // dwelling, same as above
    if ( a >= 0 ) {
      expm_eig(&(eigvals_aa[sc][a][0]), eigvecs_aa[sc][a], eiginvs_aa[sc][a], durations[Nd-1],
	       tmp_nxn_diag[Na], tmp_nxn[Na], eQ_aa[a]);
      eQ_aa_full.clear();
      matrix_from_partition(eQ_aa_full, Na, Na, &(clazz_states[a][0]), &(clazz_states[a][0]), eQ_aa[a]);
    }
    else {
      expm_eig(&(eigvals_eq[sc][0]), eigvecs_eq[sc], eiginvs_eq[sc], durations[Nd-1],
	       tmp_nxn_diag[Ns], tmp_nxn[Ns], eQ_aa_full);
    }
    noalias(alpha_post) = prod(alpha_pre, eQ_aa_full);
    // seg_ll += alpha_logscale(alpha_post);
    
    // and exiting to any class but a
    matrix<double>& Atd = AAtd[sc];
    if ( a >= 0 ) {
      Atd_final.clear();
      std::vector<int>::iterator other = clazz_oppos[a].begin();
      std::vector<int>::iterator other_end = clazz_oppos[a].end();
      for (int j=0; j<Ns; ++j) {
	if ( (other != other_end) && (*other == j) ) {
	  ++other;
	  for (int i=0; i<Ns; ++i)
	    Atd_final(i,j) = Atd(i,j);
	}
      }
    } else { // gap: no blanks
      Atd_final = Atd;
    }
    noalias(alpha_pre) = prod(alpha_post, Atd_final);
    seg_ll += alpha_logscale(alpha_pre);
    
    // now the one segment ll is done, add it to total ll
    *ll += seg_ll;
  }

  //msl_print_timestamp("---subi_ll end");
  return 0; // success
}






template<class T>
void show_mem(const char *lbl, T *data, int Nr, int Nc, bool show_label=true)
{
  if ( show_label )
    cerr << lbl << ":\n";
  for (int i=0; i<Nr; ++i) {
    for (int j=0; j<Nc; ++j) {
      cerr << data[i*Nc+j] << "  ";
    }
    cerr << endl;
  }
}

template<class T>
void show_mem(const char *lbl, T *data, int Ns, int Nr, int Nc, bool show_label=true)
{
  if ( Ns )
    show_mem(lbl, data, Nr, Nc, show_label);
  for (int i=1; i<Ns; ++i) {
    cerr << endl;
    show_mem(lbl, data+i*(Nr*Nc), Nr, Nc, false);
  }
}

template<class T>
void show_mem(const char *lbl, T *data, int Nt, int Ns, int Nr, int Nc, bool show_label=true)
{
  if ( Nt )
    show_mem(lbl, data, Ns, Nr, Nc, show_label);
  for (int i=1; i<Nt; ++i) {
    cerr << "\n------- " << i << " --------\n\n";
    show_mem(lbl, data+i*(Ns*Nr*Nc), Ns, Nr, Nc, false);
  }
}

template<class T>
void show_mem(const char *lbl, T *data, int Nu, int Nt, int Ns, int Nr, int Nc, bool show_label=true)
{
  if ( Nu )
    show_mem(lbl, data, Nt, Ns, Nr, Nc, show_label);
  for (int i=1; i<Nu; ++i) {
    cerr << "\n======= " << i << " ========\n\n";
    show_mem(lbl, data+i*(Nt*Ns*Nr*Nc), Nt, Ns, Nr, Nc, false);
  }
}

#ifdef QUBX_MAC_OPENCL
  #include "max_subi_ll.opencl"
#endif
  
  #define SMALLEST_CHUNK 8
  #define QUBX_OCL_MAX_NSTATE 16
  #define QUBX_OCL_MIN_NDATA 256
  #define QUBX_4 0
  #define QUBX_8 1
  #define QUBX_16 2

class msl_accel_data;
class msl_accel_models;

class msl_accel_context
{
public:
  int accel;
#ifdef QUBX_MAC_OPENCL
  cl_context ocl_context;
  cl_device_id ocl_device;
  cl_uint ocl_compunits;
  cl_command_queue ocl_queue;
  cl_program ocl_program;
  cl_kernel ocl_kernel[3];
  map< int, map< int, map< int, double > > > ocl_share; // [Nstate][Nmodel][int(log10(Nevent))]: proportion in [0..1] of models to run on GPU
#endif
  
  msl_accel_context()
    : accel(0) 
#ifdef QUBX_MAC_OPENCL
    , ocl_context(NULL), ocl_device(0), ocl_compunits(0), ocl_queue(NULL), ocl_program(NULL) //, ocl_kernel({NULL, NULL, NULL})
#endif
  {
#ifdef QUBX_MAC_OPENCL
    for (int i=0; i<3; ++i)
      ocl_kernel[i] = NULL;
#endif
  }
  int initialize(int accel_) {
    accel = accel_;
    if ( accel_ == MSL_ACCEL_OPENCL ) {
#ifdef QUBX_MAC_OPENCL
      cl_platform_id plat_ids[3];
      cl_device_id dev_ids[3];
      cl_context_properties props[3];
      cl_uint n, ndesc;
      cl_platform_id p_max = 0;
      cl_device_id d_max = 0;
      cl_uint n_max = 0;
      int errcode = 0;
      
      if ( (errcode = clGetPlatformIDs(3, plat_ids, &n)) ) {
	cerr << "clGetPlatformIDs: error " << errcode << endl;
	return errcode;
      }
      
      for (cl_uint i = 0; i < n; i++) {
	if ( (errcode = clGetDeviceIDs(plat_ids[i], CL_DEVICE_TYPE_GPU, 3*sizeof(cl_device_id), dev_ids, &ndesc)) ) {
	  cerr << "clGetDeviceIDs(platform " << i << "): error " << errcode << endl;
	  return errcode;
	}
	for (size_t d=0; d<ndesc; ++d) {
	  if ( (errcode = clGetDeviceInfo(dev_ids[d], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &ocl_compunits, NULL)) ) {
	    cerr << "clGetDeviceInfo: error " << errcode << endl;
	    return errcode;
	  }
	  //cerr << "Platform " << i << ", device " << d << ": " << ocl_compunits << " comp units" << endl;
	  if ( ocl_compunits > n_max ) {
	    n_max = ocl_compunits;
	    d_max = dev_ids[d];
	    p_max = plat_ids[i];
	  }
	}
      }
      ocl_device = d_max;
      ocl_compunits = n_max;
      
      props[0] = CL_CONTEXT_PLATFORM;
      props[1] = (cl_context_properties)p_max;
      props[2] = 0;
      
      //char exts[1000];
      //clGetDeviceInfo(ocl_device, CL_DEVICE_EXTENSIONS, 1000, exts, NULL);
      //cerr << "Extensions: " << exts << endl;
      
      ocl_context = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateContextFromType: error " << errcode << endl;
	return errcode;
      }
      
      ocl_queue = clCreateCommandQueue(ocl_context, ocl_device, 0, &errcode);
      if ( errcode ) {
	cerr << "clCreateCommandQueue: error " << errcode << endl;
	return errcode;
      }
      
      ocl_program = clCreateProgramWithSource(ocl_context, 1, &s_msl_kernels, NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateProgramWithSource: error " << errcode << endl;
	return errcode;
      }
      
      errcode = clBuildProgram(ocl_program, 1, &ocl_device, 0, 0, 0);
      if ( errcode ) {
	cerr << "clBuildProgram: error " << errcode << endl;
	// doesn't work:
	char build_msg[1000];
	clGetProgramBuildInfo(ocl_program, ocl_device, CL_PROGRAM_BUILD_LOG, 1000, build_msg, 0);
	cerr << build_msg << endl;
	//
	return errcode;
      }
      
      ocl_kernel[QUBX_4] = clCreateKernel(ocl_program, "msl_kernel_4", &errcode);
      if ( errcode ) {
	cerr << "clCreateKernel(msl_kernel_4): error " << errcode << endl;
	return errcode;
      }
      ocl_kernel[QUBX_8] = clCreateKernel(ocl_program, "msl_kernel_8", &errcode);
      if ( errcode ) {
	cerr << "clCreateKernel(msl_kernel_8): error " << errcode << endl;
	return errcode;
      }
      ocl_kernel[QUBX_16] = clCreateKernel(ocl_program, "msl_kernel_16", &errcode);
      if ( errcode ) {
	cerr << "clCreateKernel(msl_kernel_16): error " << errcode << endl;
	return errcode;
      }
      return accel;
#else
      --accel;
#endif
    }
    return accel;
  }
  virtual ~msl_accel_context() {
#ifdef QUBX_MAC_OPENCL
    for (int i=0; i<3; ++i)
      if ( ocl_kernel[i] )
	clReleaseKernel(ocl_kernel[i]);
    if ( ocl_program )
      clReleaseProgram(ocl_program);
    if ( ocl_queue )
      clReleaseCommandQueue(ocl_queue);
    if ( ocl_context )
      clReleaseContext(ocl_context);
#endif
  }
};

template<class T>
size_t aligned_length(MAXILL_VAR_NOT_USED T* array, size_t unaligned) {
  size_t block = 128 / sizeof(T);
  return block * ((unaligned / block) + ((unaligned % block) ? 1 : 0));
}

#define STIMCLASS_ALIGNMENT 8


class msl_accel_data
{
public:
  short Nseg;
  int Nmodel, dim, Nstim, Nsig, dim_Nsc;
  msl_accel_context *ctx;
  std::vector<float> P0; // [model_index * (dim*Nstim) + sc * dim + state_index]
  std::vector<double> ll; // [model_index * Nseg + seg_index]
  std::vector<double> stimclasses;
  std::vector<int> Ndwells;
  std::vector<int> dwells;
  std::vector<int> seg_off;
  std::vector<int> initial_stimclasses; // to avoid unnecessary p0 calc
#ifdef QUBX_MAC_OPENCL
  cl_mem ocl_ndwells;
  cl_mem ocl_seg_off;
  cl_mem ocl_dwells;
  cl_mem ocl_P0;
  cl_mem ocl_ll;
  bool ocl_sent;
#endif

  msl_accel_data()
    : Nseg(0), Nmodel(0), dim(0), Nstim(0), Nsig(0), dim_Nsc(0), ctx(NULL)
#ifdef QUBX_MAC_OPENCL
    , ocl_ndwells(NULL), ocl_seg_off(NULL), ocl_dwells(NULL), ocl_P0(NULL), ocl_ll(NULL), ocl_sent(false)
#endif
  {}
  virtual ~msl_accel_data() {
    finalize();
  }
  int initialize(msl_accel_context *ctx_, int Nseg_, int *dwellCounts, int **classeses, float **durationses, int Nsig_, MAXILL_VAR_NOT_USED int Nplex, int *plexiclasses, int Nstim_, double *stimclasses_) {
    finalize();
    
    this->ctx = ctx_;
    this->Nseg = (short) Nseg_;
    Ndwells.resize(Nseg_);
    seg_off.resize(Nseg_);
    
    set<int> init_sc;

    int maxNdwells = 0;
    int seg_at = 0;
    for (short i=0; i<Nseg_; ++i) {
      Ndwells[i] = dwellCounts[i];
      if ( dwellCounts[i] > maxNdwells )
          maxNdwells = dwellCounts[i];
      seg_off[i] = seg_at;
      seg_at += 3* (int) aligned_length(durationses[0], dwellCounts[i]);
    }
    dwells.resize(seg_at);
    // use durationses directly; transform plex class to markov and stim classes
    for (short i=0; i<Nseg_; ++i) {
      memcpy(&dwells[seg_off[i]], durationses[i], dwellCounts[i]*sizeof(float));
      int dc_aligned = (int) aligned_length(&dwells[0], dwellCounts[i]);
      int *dd = & dwells[seg_off[i] + dc_aligned];
      int dc = dwellCounts[i];
      for (int j=0; j<dc; ++j, ++dd)
	*dd = plexiclasses[2*classeses[i][j]]; // markov class
      dd = & dwells[seg_off[i] + 2*dc_aligned];
      for (int j=0; j<dc; ++j, ++dd)
	*dd = plexiclasses[2*classeses[i][j]+1]; // stimclass
      init_sc.insert(dwells[seg_off[i] + 2*dc_aligned]);
    }
    for (set<int>::iterator ini=init_sc.begin(); ini!=init_sc.end(); ++ini)
      initial_stimclasses.push_back(*ini);
    
    this->stimclasses.resize(Nstim_*Nsig_);
    this->Nstim = Nstim_;
    this->Nsig = Nsig_;
    this->dim_Nsc = STIMCLASS_ALIGNMENT * ((Nstim / STIMCLASS_ALIGNMENT) + ((Nstim % STIMCLASS_ALIGNMENT) ? 1 : 0));

    memcpy(&this->stimclasses[0], stimclasses_, Nstim_*Nsig_*sizeof(double));

    return 0;
  }
  void finalize() {
    ocl_finalize();
  }
  void ocl_finalize() {
#ifdef QUBX_MAC_OPENCL
    if ( ocl_ndwells )
      clReleaseMemObject(ocl_ndwells);
    if ( ocl_seg_off )
      clReleaseMemObject(ocl_seg_off);
    if ( ocl_dwells )
      clReleaseMemObject(ocl_dwells);
    if ( ocl_P0 )
      clReleaseMemObject(ocl_P0);
    if ( ocl_ll )
      clReleaseMemObject(ocl_ll);
    ocl_ndwells = ocl_seg_off = ocl_dwells = ocl_P0 = ocl_ll = NULL;
    ocl_sent = false;
#endif
  }
  int initialize_models(int Nmodel_, int dim_) {
    if ( Nmodel_ > this->Nmodel || dim_ != this->dim ) {
      this->Nmodel = Nmodel_;
      this->dim = dim_;
      P0.resize(Nmodel_ * dim_Nsc * dim_);
      ll.resize(Nmodel_ * Nseg);
    }
    return 0;
  }
  int send_data(int ocl_Nmodel) {
#ifdef QUBX_MAC_OPENCL
    int errcode;
    if ( ctx->accel == MSL_ACCEL_OPENCL && dim <= 16 && ocl_Nmodel) {
      if ( ! ocl_sent ) {
	ocl_seg_off = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, Nseg*sizeof(int), NULL, &errcode);
	if ( errcode ) {
	  cerr << "clCreateBuffer(ocl_seg_off): error " << errcode << endl;
	  return errcode;
	}
	//show_mem("seg_off", & seg_off[0], 1, Nseg);
	if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_seg_off, CL_FALSE, 0,
					     Nseg*sizeof(int), & seg_off[0], 0, NULL, NULL)) ) {
	  cerr << "clEnqueueWriteBuffer(ocl_seg_off): error " << errcode << endl;
	  return errcode;
	}
	
	ocl_ndwells = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, Nseg*sizeof(int), NULL, &errcode);
	if ( errcode ) {
	  cerr << "clCreateBuffer(ocl_ndwells): error " << errcode << endl;
	  return errcode;
	}
	//show_mem("ndwells", dwellCounts, 1, Nseg);
	if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_ndwells, CL_FALSE, 0,
					     Nseg*sizeof(int), & Ndwells[0], 0, NULL, NULL)) ) {
	  cerr << "clEnqueueWriteBuffer(ocl_ndwells): error " << errcode << endl;
	  return errcode;
	}
	
	ocl_dwells = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, dwells.size()*sizeof(float), NULL, &errcode);
	if ( errcode ) {
	  cerr << "clCreateBuffer(ocl_dwells): error " << errcode << endl;
	  return errcode;
	}
	//show_mem("dwells", &dwells[0], dwells.size()/16, 16);
	if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_dwells, CL_FALSE, 0,
					     dwells.size()*sizeof(int), & dwells[0],
					     0, NULL, NULL)) ) {
	  cerr << "clEnqueueWriteBuffer(ocl_dwells:cls): error " << errcode << endl;
	  return errcode;
	}
	ocl_sent = true;
      }
      if ( ocl_P0 )
	clReleaseMemObject(ocl_P0);
      ocl_P0 = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, ocl_Nmodel*dim_Nsc*dim*sizeof(float), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_P0): error " << errcode << endl;
	return errcode;
      }
      //show_mem("P0", &P0[0], ocl_Nmodel, dim_Nsc, dim);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_P0, CL_FALSE, 0,
					   ocl_Nmodel*dim_Nsc*dim*sizeof(float), &P0[0], 0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_P0): error " << errcode << endl;
	return errcode;
      }
      if ( ocl_ll )
	clReleaseMemObject(ocl_ll);
      ocl_ll = clCreateBuffer(ctx->ocl_context, CL_MEM_WRITE_ONLY, ocl_Nmodel*Nseg*sizeof(double), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_ll): error " << errcode << endl;
	return errcode;
      }
    }
#endif
    return 0;
  }
  int read_LL(int ocl_Nmodel) {
#ifdef QUBX_MAC_OPENCL
    if ( ctx->accel == MSL_ACCEL_OPENCL ) {
      int errcode;
      if ( (errcode = clEnqueueReadBuffer(ctx->ocl_queue, ocl_ll, CL_TRUE, 0,
					  ocl_Nmodel * Nseg * sizeof(double), & ll[0],
					  0, NULL, NULL)) ) {
	cerr << "clEnqueueReadBuffer(ocl_ll): error " << errcode << endl;
	return errcode;
      }
    }
#endif
    return 0;
  }
};

class msl_accel_models
{
public:
  int dim, Nsc;
  short Nmodel, dim_Nsc;
  msl_accel_context *ctx;
  std::vector<int> Ns;
  std::vector<int> clazz;
  std::vector<double> Adead;
  std::vector<double> UU, lambdas;
  std::vector<double> ll;
  std::vector< matrix<double> > eQQ; // [sc] of most-recently setup model, used for p0
#ifdef QUBX_MAC_OPENCL
  cl_mem ocl_Ns;
  cl_mem ocl_clazz;
  cl_mem ocl_Adead;
  cl_mem ocl_eQaa_U; // [Nmodel][dim_Nsc][dim+1:cls-1 | 0:full eQ][0: U | 1: Ui][dim*dim]
  cl_mem ocl_eQaa_lambda; // [Nmodel][dim_Nsc][dim+1:cls-1 | 0:full eQ][dim]
  int ocl_Nmodel;
#endif

  msl_accel_models()
    : dim(0), Nsc(0), Nmodel(0), dim_Nsc(0), ctx(NULL)
#ifdef QUBX_MAC_OPENCL
    , ocl_Ns(NULL), ocl_clazz(NULL), ocl_Adead(NULL), ocl_eQaa_U(NULL), ocl_eQaa_lambda(NULL), ocl_Nmodel(0)
#endif
  {}
  virtual ~msl_accel_models() {
    finalize();
  }
  int initialize(msl_accel_context *ctx_, int dim_, int Nmodel_, int Nsc_) {
    this->ctx = ctx_;
    this->dim = dim_;
    // this->Nmodel = Nmodel; using this->Nmodel as counter in initialize_model
    this->Nsc = Nsc_;
    this->dim_Nsc = (short) (STIMCLASS_ALIGNMENT * ((Nsc_ / STIMCLASS_ALIGNMENT) + ((Nsc_ % STIMCLASS_ALIGNMENT) ? 1 : 0)));
    Ns.resize(Nmodel_);
    clazz.resize(Nmodel_*dim_);
    Adead.resize(Nmodel_*dim_Nsc*dim_*dim_);
    UU.resize(Nmodel_*dim_Nsc*(dim+1)*2*dim*dim);
    lambdas.resize(Nmodel_*dim_Nsc*(dim+1)*dim);
    ll.resize(Nmodel_);
#ifdef QUBX_MAC_OPENCL
    if ( ctx->accel == MSL_ACCEL_OPENCL ) {
    }
#endif
    return 0;
  }
  void reset() {
    Nmodel = 0;
#ifdef QUBX_MAC_OPENCL
    ocl_Nmodel = 0;
#endif
  }
  int setup_model(int Ns_, int *clazz_, matrix<double>& K0, matrix<double>& K1, matrix<double>& K2,
		  matrix<int>& Ligand, matrix<int>& Voltage, matrix<int>& Pressure, double tdead_sec,
		  int Nstim, int Nsig, double *stimclasses,
		  eigen_func custom_eigen, callbk_reportfun report_cb, void *report_data) {
    callbk_reportstream output(report_cb, report_data);
    
    Ns[Nmodel] = Ns_;
    memcpy(& clazz[dim*Nmodel], clazz_, Ns_*sizeof(int));
    for (int i=Ns_; i<dim; ++i)
      clazz[dim*Nmodel+i] = -10;
    // list the states that are in each class, and the states which are not:
    std::vector< std::vector<int> > clazz_states(dim);
    std::vector< std::vector<int> > clazz_oppos(dim);
    int maxcls = clazz_[0];
    for (int i=0; i<Ns_; ++i) {
      clazz_states[clazz_[i]].push_back(i);
      for (int j=0; j<Ns_; ++j)
	if ( clazz_[j] != clazz_[i] )
	  clazz_oppos[clazz_[i]].push_back(j);
      if ( clazz_[i] > maxcls )
	maxcls = clazz_[i];
    }
      
    // set up P0, eQ, eigen-decomposition of submatrices eQaa, and A(tdead)
    // and temporary matrices for intermediate calculations
    
    // build dead-time-corrected Q matrices, different in each stimclass
    std::vector< matrix<double> > QQ(Nstim);
    eQQ.resize(Nstim);
    
    std::vector<double> eigvals_eq(Ns_);
    matrix<double> eigvecs_eq(Ns_, Ns_);
    matrix<double> eiginvs_eq(Ns_, Ns_);
    vector<double> eig_imag(Ns_);
    int errs_eigen = 0;
    int warnings_eigen = 0;

    // we eigendecompose each eQ for quick exponentiation in the inner loop: (gap A matrix)
    std::vector< matrix<double> > eigvecs_aa(Ns_+1), eiginvs_aa(Ns_+1);
    for (int i=1; i<Ns_; ++i) {
      eigvecs_aa[i].resize(i,i);
      eiginvs_aa[i].resize(i,i);
    }
  
    // each stimclass has Atd = exp(Q * tdead)
  //#pragma omp parallel for reduction (+: errs_eigen) // no can do: all the eigen-solvers are non-thread-safe
    for (int sc=0; sc<Nstim; ++sc) {
      BuildQ_p(Ns_, QQ[sc], K0, K1, K2, Ligand, Voltage, Pressure, stimclasses + sc*Nsig);
      QtoQe(QQ[sc], clazz_, tdead_sec, eQQ[sc]);
      
      if ( msl_super_eigen(eQQ[sc], &(eigvals_eq[0]), &(eig_imag[0]), eigvecs_eq, eiginvs_eq, custom_eigen, output) ) {
	//cerr << "in sc " << sc << endl;
	++errs_eigen;
      }
      for (int i=0; i<Ns_; ++i) {
	if ( fabs(eig_imag[i]) > 1e-1 ) { // > fabs(eigvals_eq[i]) ) {
	  //cerr << "Warning: complex eigenvalue: " << eigvals_eq[i] << " + " << eig_imag[i] << " i" << endl;
	  ++warnings_eigen;
	}
	if ( eigvals_eq[i] > 1e-1 ) {
	  //cerr << "Warning: positive eigenvalue: " << eigvals_eq[i] << endl;
	  ++warnings_eigen;
	}
      }
      //memcpy(& lambdas[sc*(dim+1)*dim], & eigvals_eq[0], Ns_*sizeof(double));
      double *lambdi = & lambdas[Nmodel*dim_Nsc*(dim+1)*dim + sc*(dim+1)*dim + Ns_ - 1];
      for (int i=Ns_-1; i>=0; --i, --lambdi)
	*lambdi = eigvals_eq[i];
      matrix_to_array(eigvecs_eq, & UU[Nmodel*dim_Nsc*(dim+1)*2*dim*dim + sc*(dim+1)*2*dim*dim], dim, dim);
      matrix_to_array(eiginvs_eq, & UU[Nmodel*dim_Nsc*(dim+1)*2*dim*dim + sc*(dim+1)*2*dim*dim + dim*dim], dim, dim);
      
      matrix<double> Atd(Ns_, Ns_), tmpm1(Ns_, Ns_), tmpm2(Ns_, Ns_);
      tmpm1.clear();
      tmpm2.clear();
      expm_eig(& eigvals_eq[0], eigvecs_eq, eiginvs_eq, tdead_sec, tmpm1, tmpm2, Atd);
      //cerr << "eQQ: " << eQQ[sc] << endl;
      //cerr << eigvecs_eq << endl;
      //cerr << eiginvs_eq << endl;
      //show_mem("eigvals:", & eigvals_eq[0], 1, Ns_);
      //cerr << "Atd: " << Atd << endl;
      matrix_to_array(Atd, & Adead[Nmodel*dim_Nsc*dim*dim + sc*dim*dim], dim, dim);
      
      // we eigendecompose each submatrix eQaa for quick exponentiation in the inner loop:
      for (int a=0; a<=maxcls; ++a) {
	int Na = (int) clazz_states[a].size();
	if ( Na ) {
	  int *aa = &(clazz_states[a][0]);
	  matrix<double> eQaa;   matrix_to_partition(eQQ[sc], Na, Na, aa, aa, eQaa);
	  if ( msl_super_eigen(eQaa, &(eigvals_eq[0]), &(eig_imag[0]), eigvecs_aa[Na], eiginvs_aa[Na], custom_eigen, output) ) {
	    //cerr << "in sc " << sc << ", class " << a << endl;
	    ++errs_eigen;
	  }
	  for (int i=0; i<Na; ++i) {
	    if ( fabs(eig_imag[i]) > 1e-1 ) { // > fabs(eigvals_eq[i]) ) {
	      //cerr << "Warning: complex eigenvalue: " << eigvals_eq[i] << " + " << eig_imag[i] << " i" << endl;
	      ++warnings_eigen;
	    }
	    if ( eigvals_eq[i] > 1e-1 ) {
	      //cerr << "Warning: positive eigenvalue: " << eigvals_eq[i] << endl;
	      ++warnings_eigen;
	    }
	  }
	  //memcpy(& lambdas[sc*(dim+1)*dim + (a+1)*dim], & eigvals_eq[0], Ns_*sizeof(double));
	  lambdi = & lambdas[Nmodel*dim_Nsc*(dim+1)*dim + sc*(dim+1)*dim + (a+1)*dim + Ns_ - 1];
	  for (int i=Ns_-1; i>=0; --i, --lambdi)
	    *lambdi = eigvals_eq[i];
	  matrix_to_array(eigvecs_aa[Na], & UU[Nmodel*dim_Nsc*(dim+1)*2*dim*dim + sc*(dim+1)*2*dim*dim + (a+1)*2*dim*dim], dim, dim);
	  matrix_to_array(eiginvs_aa[Na], & UU[Nmodel*dim_Nsc*(dim+1)*2*dim*dim + sc*(dim+1)*2*dim*dim + (a+1)*2*dim*dim + dim*dim], dim, dim);
	}
      }
    }
    
    if ( errs_eigen || warnings_eigen ) {
      output << "Eigen errors." << endl;
      return -10;
    }
    //else if ( warnings_eigen ) {
    //  output << "Warning: positive and/or complex eigenvalues detected." << endl;
    //}
    
    //show_mem("Adead", &Adead[Nmodel*dim_Nsc*dim*dim], dim_Nsc, dim, dim);
    //show_mem("UU", &UU[Nmodel*dim_Nsc*(dim+1)*2*dim*dim], dim_Nsc, 2*(dim+1), dim, dim);
    //show_mem("lambdas", & lambdas[Nmodel*dim_Nsc*(dim+1)*dim], dim_Nsc*(dim+1), dim);

    ++Nmodel;
    return 0;
  }
  int send_models(int count) {
    // batch-sending
#ifdef QUBX_MAC_OPENCL
    if ( ctx->accel == MSL_ACCEL_OPENCL && dim <= 16 && ocl_Nmodel < count ) {
      int errcode = 0;
      ocl_finalize();
      ocl_Ns = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, count*sizeof(int), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_Ns): error " << errcode << endl;
	return errcode;
      }
      ocl_clazz = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, count*dim*sizeof(int), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_clazz): error " << errcode << endl;
	return errcode;
      }
      ocl_Adead = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, count*dim_Nsc*dim*dim*sizeof(double), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_Adead): error " << errcode << endl;
	return errcode;
      }
      ocl_eQaa_U = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, count*dim_Nsc*(dim+1)*2*dim*dim*sizeof(double), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_eQaa_U): error " << errcode << endl;
	return errcode;
      }
      ocl_eQaa_lambda = clCreateBuffer(ctx->ocl_context, CL_MEM_READ_ONLY, count*dim_Nsc*(dim+1)*dim*sizeof(double), NULL, &errcode);
      if ( errcode ) {
	cerr << "clCreateBuffer(ocl_eQaa_lambda): error " << errcode << endl;
	return errcode;
      }
      //show_mem("Ns", &Ns[0], 1, count);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_Ns, CL_FALSE, 0,
					   count*sizeof(int), & Ns[0],
					   0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_Ns): error " << errcode << endl;
	return errcode;
      }
      //show_mem("clazz", &clazz[0], count, dim);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_clazz, CL_FALSE, 0,
					   count*dim*sizeof(int), & clazz[0],
					   0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_clazz): error " << errcode << endl;
	return errcode;
      }
      //show_mem("Adead", &Adead[0], count, dim_Nsc, dim, dim);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_Adead, CL_FALSE, 0,
					   count*dim_Nsc*dim*dim*sizeof(double), & Adead[0], 0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_Adead): error " << errcode << endl;
	return errcode;
      }
      
      //show_mem("eQaa_U", & UU[0], count*dim_Nsc, dim+1, 2, dim, dim);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_eQaa_U, CL_FALSE, 0,
					   count*dim_Nsc*(dim+1)*2*dim*dim*sizeof(double), & UU[0], 0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_eQaa_U): error " << errcode << endl;
	return errcode;
      }
      //show_mem("eQaa_lambda", &lambdas[0], count, dim_Nsc, dim+1, dim);
      if ( (errcode = clEnqueueWriteBuffer(ctx->ocl_queue, ocl_eQaa_lambda, CL_FALSE, 0,
					   count*dim_Nsc*(dim+1)*dim*sizeof(double), & lambdas[0], 0, NULL, NULL)) ) {
	cerr << "clEnqueueWriteBuffer(ocl_eQaa_lambda): error " << errcode << endl;
	return errcode;
      }
      ocl_Nmodel = count;
    }
#endif
    return 0;
  }
  void finalize() {
    ocl_finalize();
    Nmodel = 0;
  }
  void ocl_finalize() {
#ifdef QUBX_MAC_OPENCL
    if ( ocl_Ns )
      clReleaseMemObject(ocl_Ns);
    if ( ocl_clazz )
      clReleaseMemObject(ocl_clazz);
    if ( ocl_Adead )
      clReleaseMemObject(ocl_Adead);
    if ( ocl_eQaa_U )
      clReleaseMemObject(ocl_eQaa_U);
    if ( ocl_eQaa_lambda )
      clReleaseMemObject(ocl_eQaa_lambda);
    ocl_Ns = ocl_clazz = ocl_Adead = ocl_eQaa_U = ocl_eQaa_lambda = NULL;
    ocl_Nmodel = 0;
#endif
  }
};

double msl_accel_scale(matrix<double>& p)
{
  double sum = 0.0;
  for (int i=((int) p.size2())-1; i>=0; --i)
    sum += p(0,i);
  if ( sum )
    sum = 1.0 / sum;
  for (int i=((int) p.size2())-1; i>=0; --i)
    p(0,i) *= sum;
  //cerr << "scale: " << sum << " -> " << log(sum) << endl;
  return sum;
}

void msl_expm_eig(double *rvalues, double *vectors, double *vectors_inv, int dim, double t, matrix<double>& Q_out)
{
  // Q = vectors * diag(exp(rvalues * t)) * vectors_inv
  // Q = R * vectors_inv
  // Q_ij = sum_k(R_ik * VI_kj)
  // R_ik = V_ik * D_kk
  // Q_ij = sum_k(V_ik * exp(rvalues[k] * t) * VI_kj)
  int n = (int) Q_out.size1();
  for (int i=n-1; i>=0; --i) {
    for (int j=n-1; j>=0; --j) {
      double sum = 0.0;
      for (int k=n-1; k>=0; --k)
	sum += vectors[i*dim+k] * exp(rvalues[k] * t) * vectors_inv[k*dim+j];
      Q_out(i,j) = sum;
    }
  }
}

void msl_expm_eig_in_class(double *rvalues, double *vectors, double *vectors_inv, int dim, double t, matrix<double>& Q_out,
			   MAXILL_VAR_NOT_USED matrix<int>& states_of_class, std::vector<int>& Nstate_of_class, int c)
{
  // Q = vectors * diag(exp(rvalues * t)) * vectors_inv
  // Q = R * vectors_inv
  // Q_ij = sum_k(R_ik * VI_kj)
  // R_ik = V_ik * D_kk
  // Q_ij = sum_k(V_ik * exp(rvalues[k] * t) * VI_kj)
  int n = Nstate_of_class[c];
  for (int i=n-1; i>=0; --i) {
    for (int j=n-1; j>=0; --j) {
      double sum = 0.0;
      for (int k=n-1; k>=0; --k) {
	sum += vectors[i*dim+k] * exp(rvalues[k] * t) * vectors_inv[k*dim+j];
      }
      Q_out(i,j) = sum;
    }
  }
}

void msl_mul_Adead(matrix<double>& p1, matrix<double>& p0, double *Adead, int dim, int c0, matrix<int>& states_of_class)
{
  // p1 = p0 * Adead
  // p1(0,j) = sum_k(p0(0,k) * Adead(k,j))
  int n1 = (int) p1.size2();
  int n0 = (int) p0.size2();
  for (int j=n1-1; j>=0; --j) {
    double sum = 0.0;
    double *AA = Adead + j;
    if ( c0 >= 0 ) {
      for (int k=0; k<n0; ++k, AA += dim) {
	sum += p0(0,k) * AA[states_of_class(c0,k)*dim];
      }
    } else {
      for (int k=0; k<n0; ++k, AA += dim) {
	sum += p0(0,k) * AA[k*dim];
      }
    }
    p1(0,j) = sum;
  }
}

void msl_mul_Adead_to_class(matrix<double>& p1, matrix<double>& p0, double *Adead, int dim, MAXILL_VAR_NOT_USED int *clazz, int c1, int c0, matrix<int>& states_of_class)
{
  int n1 = (int) p1.size2();
  int n0 = (int) p0.size2();
  for (int j=n1-1; j>=0; --j) {
    double sum = 0.0;
    double *AA = Adead + states_of_class(c1,j);
    for (int k=0; k<n0; ++k, AA += dim) {
      int ks = (c0 >= 0) ? states_of_class(c0, k) : k;
      sum += p0(0,k) * AA[ks*dim]; // AA[k*dim + j]
    }
    p1(0,j) = sum;
  }
}

void msl_mul_Adead_from_class(matrix<double>& p1, matrix<double>& p0, double *Adead, int dim, int *clazz, int c, matrix<int>& states_of_class)
{
  int n = (int) p1.size2();
  int n0 = (int) p0.size2();
  for (int j=n-1; j>=0; --j) {
    if ( c == clazz[j] ) {
      p1(0,j) = 0.0;
    } else {
      double sum = 0.0;
      double *AA = Adead + j;
      for (int k=0; k<n0; ++k, AA += dim) {
	sum += p0(0,k) * AA[states_of_class(c,k)*dim];
      }
      p1(0,j) = sum;
    }
  }
}

int msl_openmp_kernel(MAXILL_VAR_NOT_USED msl_accel_context *ctx, msl_accel_data *data, msl_accel_models *models, int first_iModel, int model_count)
{
  int Nwork = model_count * data->Nseg;
  int dim = models->dim;
  int dimdim = dim * dim;

  //msl_print_timestamp("---msl_openmp_kernel begin");

  // openmp private:
  int last_iModel = -1;
  matrix<int> states_of_class;
  std::vector<int> Nstate_of_class;
  std::vector< matrix<double> > p0, p1;
  std::vector< matrix<double> > eQ_aa_Na;

#pragma omp parallel for private(states_of_class, Nstate_of_class, p0, p1, eQ_aa_Na) firstprivate(last_iModel)
  for (int iWork = 0; iWork < Nwork; ++iWork) {
    int iSeg = iWork % data->Nseg;
    int iModel = first_iModel + iWork / data->Nseg;
    int *clazz = & models->clazz[dim*iModel];
    double *QQ_U = & models->UU[iModel * data->dim_Nsc * (dim+1) * 2 * dimdim];
    double *QQ_lambda = & models->lambdas[iModel * data->dim_Nsc * (dim+1) * dim];
    double *Adead = & models->Adead[iModel * data->dim_Nsc * dimdim];
    int Nstate = models->Ns[iModel];
    int Ndwell = data->Ndwells[iSeg];
    int dc_aligned = (int) aligned_length(&data->dwells[0], Ndwell);
    float *tdwt = (float *) & data->dwells[data->seg_off[iSeg]];
    int *idwt = & data->dwells[data->seg_off[iSeg] + dc_aligned];
    int *sdwt = idwt + dc_aligned;
    
    if ( iModel != last_iModel ) {
      states_of_class.resize(dim, dim);
      Nstate_of_class.resize(dim);
      p0.resize(Nstate+1);
      p1.resize(Nstate+1);
      eQ_aa_Na.resize(Nstate+1);
      for (int j=Nstate; j>=0; --j) {
	p0[j].resize(1, j);
	p1[j].resize(1, j);
	eQ_aa_Na[j].resize(j, j);
      }
      for (int j=dim-1; j>=0; --j) {
	int Ns_c = 0;
	for (int k=0; k<Nstate; ++k)
	  if ( clazz[k] == j )
	    states_of_class(j, Ns_c++) = k;
	Nstate_of_class[j] = Ns_c;
      }
      last_iModel = iModel;
    }
    
    int c = idwt[0];
    int Nc = (c >= 0) ? Nstate_of_class[c] : Nstate;
    int sc = sdwt[0];
    float dur = tdwt[0];
    for (int j=Nc-1; j>=0; --j)
      p0[Nc](0,j) = ((c >= 0) && (clazz[states_of_class(c,j)] != c)) ? 0.0 : data->P0[iModel*data->dim_Nsc*dim + sc*dim + states_of_class(c, j)];
    if ( ! msl_accel_scale(p0[Nc]) )
      for (int j=Nc-1; j>=0; --j)
	p0[Nc](0,j) = 1.0 / Nc;
    double ll = 0.0;
    for (int d=1; d<Ndwell; ++d) {
      //cerr << (d-1) << ": " << p0[Nc] << " @" << c << " (" << sc << " | " << Nc << "/" << Nstate << ") for " << dur << std::endl;
      if ( c >= 0 )
	msl_expm_eig_in_class(QQ_lambda+(sc*(dim+1) + (c+1))*dim, QQ_U+(sc*(dim+1) + (c+1))*2*dimdim,
			      QQ_U+(sc*(dim+1) + (c+1))*2*dimdim + dimdim,
			      dim, dur, eQ_aa_Na[Nc], states_of_class, Nstate_of_class, c);
      else
	msl_expm_eig(QQ_lambda+sc*(dim+1)*dim, QQ_U+sc*(dim+1)*2*dimdim, QQ_U+(sc*(dim+1)*2 + 1)*dimdim,
		     dim, dur, eQ_aa_Na[Nc]);
      noalias(p1[Nc]) = prod(p0[Nc], eQ_aa_Na[Nc]);
      //cerr << "p0[" << Nc << "] = " << p0[Nc] << endl;
      //cerr << "eA_aa_Na[" << Nc << "] = " << eQ_aa_Na[Nc] << endl;
      //cerr << "p1[" << Nc << "] = " << p1[Nc] << endl;
      
      int c_to = idwt[d];
      int Nc_to = (c_to >= 0) ? Nstate_of_class[c_to] : Nstate;
      if ( c_to >= 0 )
	msl_mul_Adead_to_class(p0[Nc_to], p1[Nc], Adead+sc*dimdim, dim, clazz, c_to, c, states_of_class);
      else
	msl_mul_Adead(p0[Nc_to], p1[Nc], Adead+sc*dimdim, dim, c, states_of_class);
      //show_mem("Adead:", Adead+sc*dimdim, dim, dim);
      //cerr << "p0[" << Nc_to << "] = " << p0[Nc_to] << endl;
      ll += log(msl_accel_scale(p0[Nc_to]));
      //if ( ! finite(ll) )
      //  cerr << d << ": (" << c << ", " << dur << "): from " << p1[Nc] << " by " << endl << eQ_aa_Na[Nc] << endl;
      sc = sdwt[d];
      dur = tdwt[d];
      c = c_to;
      Nc = Nc_to;
    }
    if ( c >= 0 )
      msl_expm_eig_in_class(QQ_lambda+(sc*(dim+1) + (c+1))*dim, QQ_U+(sc*(dim+1) + (c+1))*2*dimdim,
			    QQ_U+(sc*(dim+1) + (c+1))*2*dimdim + dimdim,
			    dim, dur, eQ_aa_Na[Nc], states_of_class, Nstate_of_class, c);
    else
      msl_expm_eig(QQ_lambda+sc*(dim+1)*dim, QQ_U+sc*(dim+1)*2*dimdim, QQ_U+(sc*(dim+1)*2 + 1)*dimdim,
		   dim, dur, eQ_aa_Na[Nc]);
    noalias(p1[Nc]) = prod(p0[Nc], eQ_aa_Na[Nc]);
    //cerr << "eA_aa_Na[" << Nc << "] = " << eQ_aa_Na[Nc] << endl;
    //cerr << "p1[" << Nc << "] = " << p1[Nc] << endl;
    if ( c >= 0 )
      msl_mul_Adead_from_class(p0[Nstate], p1[Nc], Adead+sc*dimdim, dim, clazz, c, states_of_class);
    else
      msl_mul_Adead(p0[Nstate], p1[Nc], Adead+sc*dimdim, dim, c, states_of_class);
    //show_mem("Adead:", Adead+sc*dimdim, dim, dim);
    //cerr << "p0[" << Nstate << "] = " << p0[Nstate] << endl;
    ll += log(msl_accel_scale(p0[Nstate]));
    //if ( ! finite(ll) )
    //  cerr << "last: (" << c << ", " << dur << "): from " << p1[Nc] << endl;
    data->ll[iModel*data->Nseg + iSeg] = - ll;
  }

  //msl_print_timestamp("---msl_openmp_kernel end");

  return 0;
}


int mil_openmp_kernel(MAXILL_VAR_NOT_USED msl_accel_context *ctx, msl_accel_data *data, msl_accel_models *models, int first_iModel, int model_count)
{
  int Nwork = model_count * data->Nseg;
  int dim = models->dim;
  int dimdim = dim * dim;

  //msl_print_timestamp("---mil_openmp_kernel begin");

  // openmp private:
  int last_iModel = -1;
  matrix<int> states_of_class;
  std::vector<int> Nstate_of_class;
  std::vector< matrix<double> > p0, p1;
  std::vector< matrix<double> > eQ_aa_Na;
  std::vector< std::vector< matrix<double> > > eQab;

#pragma omp parallel for private(states_of_class, Nstate_of_class, p0, p1, eQ_aa_Na, eQab) firstprivate(last_iModel)
  for (int iWork = 0; iWork < Nwork; ++iWork) {
    int iSeg = iWork % data->Nseg;
    int iModel = first_iModel + iWork / data->Nseg;
    int *clazz = & models->clazz[dim*iModel];
    double *QQ_U = & models->UU[iModel * data->dim_Nsc * (dim+1) * 2 * dimdim];
    double *QQ_lambda = & models->lambdas[iModel * data->dim_Nsc * (dim+1) * dim];
    int Nstate = models->Ns[iModel];
    int Ndwell = data->Ndwells[iSeg];
    int dc_aligned = (int) aligned_length(&data->dwells[0], Ndwell);
    float *tdwt = (float *) & data->dwells[data->seg_off[iSeg]];
    int *idwt = & data->dwells[data->seg_off[iSeg] + dc_aligned];
    int *sdwt = idwt + dc_aligned;
    
    if ( iModel != last_iModel ) {
      states_of_class.resize(dim, dim);
      Nstate_of_class.resize(dim);
      p0.resize(Nstate+1);
      p1.resize(Nstate+1);
      eQ_aa_Na.resize(Nstate+1);
      for (int j=Nstate; j>=0; --j) {
	p0[j].resize(1, j);
	p1[j].resize(1, j);
	eQ_aa_Na[j].resize(j, j);
      }

      for (int j=dim-1; j>=0; --j) {
	int Ns_c = 0;
	for (int k=0; k<Nstate; ++k)
	  if ( clazz[k] == j )
	    states_of_class(j, Ns_c++) = k;
	Nstate_of_class[j] = Ns_c;
      }

      int maxcls = 0;
      for (int a=0; a<Nstate; ++a)
	if ( maxcls < clazz[a] )
	  maxcls = clazz[a];
      eQab.resize(maxcls+1);
      for (int a=0; a<=maxcls; ++a) {
	eQab[a].resize(maxcls+1);
	int Na = Nstate_of_class[a];
	if ( Na ) {
	  for (int b=0; b<=maxcls; ++b) {
	    int Nb = Nstate_of_class[b];
	    if ( (b != a) && Nb ) {
	      matrix<double>& subQ = eQab[a][b];
	      subQ.resize(Na, Nb);
	      matrix_to_partition(models->eQQ[0], Na, Nb, &(states_of_class(a,0)), &(states_of_class(b,0)), subQ);
	    }
	  }
	}
      }
     
      last_iModel = iModel;
    }
    
    int c = idwt[0];
    int Nc = Nstate_of_class[c];
    int sc = sdwt[0];
    float dur = tdwt[0];
    for (int j=Nc-1; j>=0; --j)
      p0[Nc](0,j) = ((c >= 0) && (clazz[states_of_class(c,j)] != c)) ? 0.0 : data->P0[iModel*data->dim_Nsc*dim + sc*dim + states_of_class(c, j)];
    msl_accel_scale(p0[Nc]);
    //cerr << "P0: " << p0[Nc] << endl;
    if ( ! msl_accel_scale(p0[Nc]) )
      for (int j=Nc-1; j>=0; --j)
	p0[Nc](0,j) = 1.0 / Nc;
    double ll = 0.0;
    for (int d=1; d<Ndwell; ++d) {
      msl_expm_eig_in_class(QQ_lambda+(sc*(dim+1) + (c+1))*dim, QQ_U+(sc*(dim+1) + (c+1))*2*dimdim,
			    QQ_U+(sc*(dim+1) + (c+1))*2*dimdim + dimdim,
			    dim, dur, eQ_aa_Na[Nc], states_of_class, Nstate_of_class, c);
      noalias(p1[Nc]) = prod(p0[Nc], eQ_aa_Na[Nc]);
      //cerr << "p0[" << Nc << "] = " << p0[Nc] << endl;
      //cerr << "eA_aa_Na[" << Nc << "] = " << eQ_aa_Na[Nc] << endl;
      //cerr << "p1[" << Nc << "] = " << p1[Nc] << endl;
      ll += log(msl_accel_scale(p1[Nc]));
      
      int c_to = idwt[d];
      int Nc_to = Nstate_of_class[c_to];
      noalias(p0[Nc_to]) = prod(p1[Nc], eQab[c][c_to]);
      //cerr << "p0[" << Nc_to << "] = " << p0[Nc_to] << endl;
      ll += log(msl_accel_scale(p0[Nc_to]));
      sc = sdwt[d];
      dur = tdwt[d];
      c = c_to;
      Nc = Nc_to;
    }
    msl_expm_eig_in_class(QQ_lambda+(sc*(dim+1) + (c+1))*dim, QQ_U+(sc*(dim+1) + (c+1))*2*dimdim,
			  QQ_U+(sc*(dim+1) + (c+1))*2*dimdim + dimdim,
			  dim, dur, eQ_aa_Na[Nc], states_of_class, Nstate_of_class, c);
    noalias(p1[Nc]) = prod(p0[Nc], eQ_aa_Na[Nc]);
    //cerr << "eA_aa_Na[" << Nc << "] = " << eQ_aa_Na[Nc] << endl;
    //cerr << "p1[" << Nc << "] = " << p1[Nc] << endl;
    ll += log(msl_accel_scale(p1[Nc]));
    matrix<double> eQaz(Nc, Nstate);
    for (int a=0; a<Nc; ++a)
      for (int b=0; b<Nstate; ++b)
	eQaz(a,b) = (clazz[b] != c) ? models->eQQ[0](states_of_class(c,a), b) : 0.0;
    noalias(p0[Nstate]) = prod(p1[Nc], eQaz);
    //cerr << eQaz << endl;
    //cerr << "p0[" << Nstate << "] = " << p0[Nstate] << endl;
    ll += log(msl_accel_scale(p0[Nstate]));
    data->ll[iModel*data->Nseg + iSeg] = - ll;
  }

  //msl_print_timestamp("---mil_openmp_kernel end");

  return 0;
}



extern "C" MAXILL_API void* msl_accel_context_init(int *accel)
{
  msl_accel_context *ctx = new msl_accel_context;
  while ( *accel >= 0 ) {
    if ( (*accel = ctx->initialize(*accel)) < 0 )
      delete ctx;
    else
      return ctx;
    cerr << "msl_accel_context_init failed at level " << *accel << endl;
    --(*accel);
  }
  return NULL;
}

extern "C" MAXILL_API void msl_accel_context_free(void *ctx_)
{
  msl_accel_context *ctx = (msl_accel_context *) ctx_;
  delete ctx;
}

extern "C" MAXILL_API void* msl_accel_data_init(void *ctx_, int Nseg, int *dwellCounts, int **classeses, float **durationses, int Nsig, int Nplex, int *plexiclass, int Nstim, double *stimclasses)
{
  msl_accel_context *ctx = (msl_accel_context *) ctx_;
  msl_accel_data *data = new msl_accel_data;
  //msl_print_timestamp("---data_init");
  if ( data->initialize(ctx, Nseg, dwellCounts, classeses, durationses, Nsig, Nplex, plexiclass, Nstim, stimclasses) ) {
    delete data;
    return NULL;
  }
  //msl_print_timestamp("---/data_init");
  return data;
}

extern "C" MAXILL_API void msl_accel_data_free(void *data_)
{
  msl_accel_data *data = (msl_accel_data *) data_;
  delete data;
}

extern "C" MAXILL_API double* msl_accel_data_get_ll(void *data_)
{
  msl_accel_data *data = (msl_accel_data *) data_;
  return & data->ll[0];
}

extern "C" MAXILL_API void* msl_accel_models_init(void *ctx_, int dim, int Nmodel, int Nsc)
{
  msl_accel_context *ctx = (msl_accel_context *) ctx_;
  msl_accel_models *models = new msl_accel_models;
  if ( models->initialize(ctx, dim, Nmodel, Nsc) ) {
    delete models;
    return NULL;
  }
  return models;
}

extern "C" MAXILL_API void msl_accel_models_free(void *models_)
{
  msl_accel_models *models = (msl_accel_models *) models_;
  delete models;
}

extern "C" MAXILL_API void msl_accel_models_reset(void *models_)
{
  msl_accel_models *models = (msl_accel_models *) models_;
  models->reset();
}

extern "C" MAXILL_API int msl_accel_models_setup_model(void *models_, void *data_, int Ns, int *clazz, double *P0_in, double **K0_in, double **K1_in, double **K2_in,
						       int **Ligand_in, int **Voltage_in, int **Pressure_in, double tdead_sec,
						       eigen_func custom_eigen, callbk_reportfun report_cb, void *report_data)
{
  msl_accel_models *models = (msl_accel_models *) models_;
  msl_accel_data *data = (msl_accel_data *) data_;

  // copy inputs into ublas objects
  matrix<double> K0;         matrix_from_arrays(K0, K0_in, Ns, Ns);
  matrix<double> K1;         matrix_from_arrays(K1, K1_in, Ns, Ns);
  matrix<double> K2;         matrix_from_arrays(K2, K2_in, Ns, Ns);
  matrix<int>    Ligand;     matrix_from_arrays(Ligand, Ligand_in, Ns, Ns);
  matrix<int>    Voltage;    matrix_from_arrays(Voltage, Voltage_in, Ns, Ns);
  matrix<int>    Pressure;   matrix_from_arrays(Pressure, Pressure_in, Ns, Ns);

  //msl_print_timestamp("---setup_model");
  int iModel = models->Nmodel;
  int errcode = models->setup_model(Ns, clazz, K0, K1, K2, Ligand, Voltage, Pressure, tdead_sec, data->Nstim, data->Nsig, & data->stimclasses[0], custom_eigen, report_cb, report_data);
  if ( errcode ) {
    return errcode;
  }
  //msl_print_timestamp("   +P0");
  // setup per-sc P0 in data
  data->initialize_models((int) models->Ns.size(), models->dim);
  matrix<double> P0(1, Ns);
  if ( P0_in ) {
    matrix_from_array(P0, P0_in, 1, Ns);
    double sum = 0.0;
    for (int i=0; i<Ns; ++i)
      sum += P0_in[i];
    for (int i=0; i<Ns; ++i)
      P0(0,i) = (sum > 1e-8 ) ? (P0(0,i) / sum) : (1.0 / Ns);
  }
#pragma omp parallel for firstprivate(P0)
  for (int sci=((int) data->initial_stimclasses.size())-1; sci>=0; --sci) {
    int sc = data->initial_stimclasses[sci];
    if ( ! P0_in ) {
      QtoPe(models->eQQ[sc], P0);
    }
    matrix_to_array(P0, & data->P0[iModel*data->dim_Nsc*models->dim + sc*models->dim], 1, models->dim);
  }
  //msl_print_timestamp("--------------");
  return 0;
}

extern "C" MAXILL_API int msl_accel_models_setup_model_arr(void *models_, void *data_, int Ns, int *clazz, double *P0_in, double *K0_in, double *K1_in, double *K2_in,
							   int *Ligand_in, int *Voltage_in, int *Pressure_in, double tdead_sec,
							   eigen_func custom_eigen, callbk_reportfun report_cb, void *report_data)
{
  std::vector<double*> K0, K1, K2;
  std::vector<int*> Lig, Vol, Pres;
  for (int i=0; i<Ns; ++i) {
    K0.push_back( K0_in + i*Ns );
    K1.push_back( K1_in + i*Ns );
    K2.push_back( K2_in + i*Ns );
    Lig.push_back( Ligand_in + i*Ns );
    Vol.push_back( Voltage_in + i*Ns );
    Pres.push_back( Pressure_in + i*Ns );
  }
  return msl_accel_models_setup_model(models_, data_, Ns, clazz, P0_in, &K0[0], &K1[0], &K2[0], &Lig[0], &Vol[0], &Pres[0], tdead_sec, custom_eigen, report_cb, report_data);
}

extern "C" MAXILL_API double* msl_accel_models_get_ll(void *models_)
{
  msl_accel_models *models = (msl_accel_models *) models_;
  return & models->ll[0];
}

#define MSL_DEFAULT_OCL_SHARE 0.1
#define MSL_OCL_SHARE_INCR 0.01
#define MSL_OCL_SHARE_FRAC_LOW 0.02
#define MSL_OCL_SHARE_FRAC_HIGH 0.05

int timedelta_ms(struct timeval *a, struct timeval *b)
{
  return (int) ( (b->tv_sec - a->tv_sec)*1000 + (b->tv_usec - a->tv_usec)/1000 );
}

extern "C" MAXILL_API int msl_accel_ll(void *ctx_, void *data_, void *models_, int accel)
{
  msl_accel_context *ctx = (msl_accel_context *) ctx_;
  msl_accel_data *data = (msl_accel_data *) data_;
  msl_accel_models *models = (msl_accel_models *) models_;
  int rtn = 0;
  
  int dim = models->dim;

  int models_done = 0;
  if ( accel > ctx->accel )
    accel = ctx->accel;
  
#ifdef QUBX_MAC_OPENCL
  int Nevent = 0;
  for (int i=0; i<data->Nseg; ++i)
    Nevent += data->Ndwells[i];
  double ocl_share = ctx->ocl_share[models->Ns[0]][models->Nmodel][(int)log10((double)Nevent)];
  if ( ocl_share == 0.0 )
    ocl_share = MSL_DEFAULT_OCL_SHARE;
  int ocl_count = (int) (0.5 + models->Nmodel * ocl_share);
  struct timeval tm_pre, tm_cpu, tm_done;
  gettimeofday(&tm_pre, NULL);

  if ( (accel == MSL_ACCEL_OPENCL) && (dim <= 16) && ocl_count ) {

    //msl_print_timestamp("queueing data, models");
    if ( (rtn = data->send_data(ocl_count)) ) {
      return rtn;
    }
    if ( (rtn = models->send_models(ocl_count)) ) {
      return rtn;
    }
    //msl_print_timestamp("begin opencl");
    
    int kern_ix = ((dim == 4) ? QUBX_4
		   : ((dim == 8) ? QUBX_8
		      : QUBX_16));
    size_t local_dim[2] = {(size_t) dim, (size_t) dim};
    size_t global_dim[2] = {((size_t)(dim*data->Nseg)), ((size_t)(dim*ocl_count))};
    
    cl_kernel msl_kernel = ctx->ocl_kernel[kern_ix];
    if ( (rtn = clSetKernelArg(msl_kernel, 0, sizeof(short), & data->Nseg)) ) {
      cerr << "clSetKernelArg(msl, 0): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 1, sizeof(short), & ocl_count)) ) {
      cerr << "clSetKernelArg(msl, 1): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 2, sizeof(short), & models->dim_Nsc)) ) {
      cerr << "clSetKernelArg(msl, 2): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 3, sizeof(cl_mem), & data->ocl_ndwells)) ) {
      cerr << "clSetKernelArg(msl, 3): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 4, sizeof(cl_mem), & data->ocl_seg_off)) ) {
      cerr << "clSetKernelArg(msl, 5): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 5, sizeof(cl_mem), & data->ocl_dwells)) ) {
      cerr << "clSetKernelArg(msl, 6): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 6, sizeof(cl_mem), & data->ocl_P0)) ) {
      cerr << "clSetKernelArg(msl, 6): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 7, sizeof(cl_mem), & data->ocl_ll)) ) {
      cerr << "clSetKernelArg(msl, 7): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 8, sizeof(cl_mem), & models->ocl_Ns)) ) {
      cerr << "clSetKernelArg(msl, 8): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 9, sizeof(cl_mem), & models->ocl_clazz)) ) {
      cerr << "clSetKernelArg(msl, 9): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 10, sizeof(cl_mem), & models->ocl_Adead)) ) {
      cerr << "clSetKernelArg(msl, 10): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 11, sizeof(cl_mem), & models->ocl_eQaa_U)) ) {
      cerr << "clSetKernelArg(msl, 11): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 12, sizeof(cl_mem), & models->ocl_eQaa_lambda)) ) {
      cerr << "clSetKernelArg(msl, 12): error " << rtn << endl;
      return rtn;
    }
    //msl_print_timestamp("start msl kernel");
    if ( (rtn = clEnqueueNDRangeKernel(ctx->ocl_queue, msl_kernel, 2, 0,
				       (const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) {
      cerr << "clEnqueueNDRangeKernel(msl_kernel): error " << rtn << endl;
      return rtn;
    }
    //msl_print_timestamp("end msl kernel");
    models_done = ocl_count;
  } else {
    ocl_count = 0;
  }
#endif

  if ( models_done < models->Nmodel ) {
    // todo split openmp from accel=0
    if ( (rtn = msl_openmp_kernel(ctx, data, models, models_done, models->Nmodel - models_done)) )
      return rtn;
    models_done = models->Nmodel;
  }

#ifdef QUBX_MAC_OPENCL
  gettimeofday(&tm_cpu, NULL);
  if ( ocl_count ) {
    if ( (rtn = data->read_LL(ocl_count)) )
      return rtn;
    //msl_print_timestamp("read ll");
    gettimeofday(&tm_done, NULL);
    double frac_waiting = timedelta_ms(&tm_cpu, &tm_done) * 1.0 / timedelta_ms(&tm_pre, &tm_cpu);
    if ( ocl_share > 0.0 && frac_waiting > MSL_OCL_SHARE_FRAC_HIGH )
      ocl_share -= MSL_OCL_SHARE_INCR;
    else if ( ocl_share < 1.0 && frac_waiting < MSL_OCL_SHARE_FRAC_LOW )
      ocl_share += MSL_OCL_SHARE_INCR;
    ctx->ocl_share[models->Ns[0]][models->Nmodel][(int)log10((double)Nevent)] = ocl_share;
  }
#endif

  // summarize per-model ll
  for (int i=0; i<models->Nmodel; ++i) {
    double sum = 0.0;
    double *lli = & data->ll[i*data->Nseg];
    for (int s=0; s<data->Nseg; ++s, ++lli)
      sum += *lli;
    models->ll[i] = finite(sum) ? sum : -1e22;
  }
  
  return 0;
}

extern "C" MAXILL_API int mil_accel_ll(void *ctx_, void *data_, void *models_, int accel)
{
  msl_accel_context *ctx = (msl_accel_context *) ctx_;
  msl_accel_data *data = (msl_accel_data *) data_;
  msl_accel_models *models = (msl_accel_models *) models_;
  int rtn = 0;
  
  int models_done = 0;
  if ( accel > ctx->accel )
    accel = ctx->accel;

#ifdef QUBX_MAC_OPENCL_DIS
  int dim = models->dim;
  int Nevent = 0;
  for (int i=0; i<data->Nseg; ++i)
    Nevent += data->Ndwells[i];
  double ocl_share = ctx->ocl_share[models->Ns[0]][models->Nmodel][(int)log10((double)Nevent)];
  if ( ocl_share == 0.0 )
    ocl_share = MSL_DEFAULT_OCL_SHARE;
  int ocl_count = (int) (0.5 + models->Nmodel * ocl_share);
  struct timeval tm_pre, tm_cpu, tm_done;
  gettimeofday(&tm_pre, NULL);

  if ( (accel == MSL_ACCEL_OPENCL) && (dim <= 16) && ocl_count ) {

    //msl_print_timestamp("queueing data, models");
    if ( (rtn = data->send_data(ocl_count)) ) {
      return rtn;
    }
    if ( (rtn = models->send_models(ocl_count)) ) {
      return rtn;
    }
    //msl_print_timestamp("begin opencl");
    
    int kern_ix = ((dim == 4) ? QUBX_4
		   : ((dim == 8) ? QUBX_8
		      : QUBX_16));
    size_t local_dim[2] = {dim, dim};
    size_t global_dim[2] = {dim*data->Nseg, dim*ocl_count};
    
    cl_kernel msl_kernel = ctx->ocl_kernel[kern_ix];
    if ( (rtn = clSetKernelArg(msl_kernel, 0, sizeof(short), & data->Nseg)) ) {
      cerr << "clSetKernelArg(msl, 0): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 1, sizeof(short), & ocl_count)) ) {
      cerr << "clSetKernelArg(msl, 1): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 2, sizeof(short), & models->dim_Nsc)) ) {
      cerr << "clSetKernelArg(msl, 2): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 3, sizeof(cl_mem), & data->ocl_ndwells)) ) {
      cerr << "clSetKernelArg(msl, 3): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 4, sizeof(cl_mem), & data->ocl_seg_off)) ) {
      cerr << "clSetKernelArg(msl, 5): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 5, sizeof(cl_mem), & data->ocl_dwells)) ) {
      cerr << "clSetKernelArg(msl, 6): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 6, sizeof(cl_mem), & data->ocl_P0)) ) {
      cerr << "clSetKernelArg(msl, 6): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 7, sizeof(cl_mem), & data->ocl_ll)) ) {
      cerr << "clSetKernelArg(msl, 7): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 8, sizeof(cl_mem), & models->ocl_Ns)) ) {
      cerr << "clSetKernelArg(msl, 8): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 9, sizeof(cl_mem), & models->ocl_clazz)) ) {
      cerr << "clSetKernelArg(msl, 9): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 10, sizeof(cl_mem), & models->ocl_Adead)) ) {
      cerr << "clSetKernelArg(msl, 10): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 11, sizeof(cl_mem), & models->ocl_eQaa_U)) ) {
      cerr << "clSetKernelArg(msl, 11): error " << rtn << endl;
      return rtn;
    }
    if ( (rtn = clSetKernelArg(msl_kernel, 12, sizeof(cl_mem), & models->ocl_eQaa_lambda)) ) {
      cerr << "clSetKernelArg(msl, 12): error " << rtn << endl;
      return rtn;
    }
    //msl_print_timestamp("start msl kernel");
    if ( (rtn = clEnqueueNDRangeKernel(ctx->ocl_queue, msl_kernel, 2, 0,
				       (const size_t *) &global_dim, (const size_t *) &local_dim, 0, NULL, NULL)) ) {
      cerr << "clEnqueueNDRangeKernel(msl_kernel): error " << rtn << endl;
      return rtn;
    }
    //msl_print_timestamp("end msl kernel");
    models_done = ocl_count;
  } else {
    ocl_count = 0;
  }
#endif

  if ( models_done < models->Nmodel ) {
    // todo split openmp from accel=0
    if ( (rtn = mil_openmp_kernel(ctx, data, models, models_done, models->Nmodel - models_done)) )
      return rtn;
    models_done = models->Nmodel;
  }

#ifdef QUBX_MAC_OPENCL_DIS
  gettimeofday(&tm_cpu, NULL);
  if ( ocl_count ) {
    if ( (rtn = data->read_LL(ocl_count)) )
      return rtn;
    //msl_print_timestamp("read ll");
    gettimeofday(&tm_done, NULL);
    double frac_waiting = timedelta_ms(&tm_cpu, &tm_done) * 1.0 / timedelta_ms(&tm_pre, &tm_cpu);
    if ( ocl_share > 0.0 && frac_waiting > MSL_OCL_SHARE_FRAC_HIGH )
      ocl_share -= MSL_OCL_SHARE_INCR;
    else if ( ocl_share < 1.0 && frac_waiting < MSL_OCL_SHARE_FRAC_LOW )
      ocl_share += MSL_OCL_SHARE_INCR;
    ctx->ocl_share[models->Ns[0]][models->Nmodel][(int)log10((double)Nevent)] = ocl_share;
  }
#endif


  // summarize per-model ll
  for (int i=0; i<models->Nmodel; ++i) {
    double sum = 0.0;
    double *lli = & data->ll[i*data->Nseg];
    for (int s=0; s<data->Nseg; ++s, ++lli)
      sum += *lli;
    models->ll[i] = finite(sum) ? sum : -1e22;
  }
  
  return 0;
}