00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00028 #ifndef SEQPP_PHASEDMARKOV_H
00029 #define SEQPP_PHASEDMARKOV_H
00030
00031 using namespace std;
00032 #include <seqpp/SequenceSet.h>
00033 #include <seqpp/arnoldi.h>
00034 #include <vector>
00035 #include <string>
00036
00037 #include <gsl/gsl_sf_gamma.h>
00038
00063 class PhasedMarkov
00064 {
00065
00066 public :
00067
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085 PhasedMarkov( const string & markov_file,
00086 bool calc_rank = false );
00087
00089
00096 PhasedMarkov( const SequenceSet & seqset,
00097 short phase, short initial_phase = 0,
00098 bool calc_rank = false,
00099 const string& prior_alpha_file = string() );
00100
00102
00109 PhasedMarkov( const Sequence & seq,
00110 short phase, short initial_phase = 0,
00111 bool calc_rank = false,
00112 const string& prior_alpha_file = string() );
00113
00115 PhasedMarkov( const PhasedMarkov & phm );
00116
00118 PhasedMarkov();
00119
00121
00129 PhasedMarkov( short size, short order, short phase, bool alloc=true,
00130 const string& prior_alpha_file = string() );
00131
00133
00134
00135
00136
00137
00138 PhasedMarkov(const PhasedMarkov &M1, const PhasedMarkov &M2, const float p);
00139
00141
00149 PhasedMarkov(const SequenceSet & seqset,
00150 const vector<int> &Indseq,
00151 short phase, short initial_phase = 0,
00152 bool calc_rank = false,
00153 const string& prior_alpha_file = string() );
00154
00156
00179 PhasedMarkov( const gsl_rng * r,
00180 short size, short order,
00181 short phase,
00182 bool calc_rank = false );
00183
00185
00194 PhasedMarkov( unsigned long * * count,
00195 short size, short order,
00196 short phase, short initial_phase = 0,
00197 bool calc_rank = false,
00198 const string& prior_alpha_file = string() );
00199
00201 virtual ~PhasedMarkov();
00202
00203
00205
00214 template <class TSeq>
00215 void estimate( const TSeq & tseq,
00216 short phase, short initial_phase,
00217 unsigned long beg, unsigned long end,
00218 bool calc_rank = false, bool count_again = true )
00219 {
00220 if (count_again)
00221 tseq.clear_count();
00222 tseq.count_p_occurencies(_phase,initial_phase,beg,end);
00223
00224 estimate( tseq.get_p_count(), true, calc_rank );
00225 }
00226
00227
00229
00233 void estimate( const string& count_file, bool calc_rank= false )
00234 {
00235 unsigned long ** count = new unsigned long*[_phase];
00236 for (short p=0; p<_phase; p++ )
00237 count[p] = new unsigned long[_nPi];
00238
00239 if (!load_count( count_file, count )){
00240 cerr<<"Bad count file"<<endl;
00241 exit(1);
00242 }
00243
00244 estimate( count, false, calc_rank );
00245
00246 for (short p=0; p<_phase; p++ )
00247 delete[] count[p];
00248 delete [] count;
00249 }
00250
00252
00257 void estimate( unsigned long * * count, bool decal_required, bool calc_rank= false );
00258
00259
00260
00262 const double ** markov_matrices( ) const{
00263 return const_cast< const double** >(_Pis);
00264 }
00266 const double * markov_matrix( short numphase ) const{
00267 if ( numphase<_phase )
00268 return _Pis[numphase];
00269 else
00270 return NULL;
00271 }
00272
00274
00293 void draw_markov_matrices(const gsl_rng * r);
00294
00295
00297 inline virtual void new_markov_matrices();
00299 inline virtual void free_markov_matrices();
00300
00302 double total_variation( const PhasedMarkov & M );
00303
00304
00305
00307
00308
00309
00310 void compute_stat_laws( bool force = false );
00312 const double * stat_law( short numphase = 0 ) const{
00313 if ( numphase<_phase )
00314 return _Mus[numphase];
00315 else
00316 return NULL;
00317 }
00319 inline void free_stat_laws();
00320
00322
00323
00324
00325
00326 void compute_init_law( double * MuInit, const SequenceSet & seqset ) const;
00327
00328
00329
00331 virtual int compute_rank();
00333 virtual long nb_parameters() const{
00334 return _nb_param;
00335 }
00336
00337
00338
00340 void link_to_translator( const Translator & trans ){
00341 _trans = &trans;
00342 }
00344
00349 double proba_c( const string & word,
00350 Coder & coder, short numphase=0 ) const;
00352
00357 double proba( const string & word,
00358 Coder & coder, short numphase=0 ) const;
00359
00360
00362
00367 double proba_c( const vector<short> & word,
00368 Coder & coder, short numphase=0 ) const;
00370
00375 double proba( const vector<short> & word,
00376 Coder & coder, short numphase=0 ) const;
00377
00378
00380
00386 double proba_c( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00388
00394 double proba( long word, int lw=-1, long jump=-1, short numphase=0 ) const;
00395
00396
00398
00404 double proba_c(const long * seq, long tbeg, long tend, short numphase=0) const;
00405
00407
00413 double proba(const long * seq, long tbeg, long tend, short numphase=0) const;
00414
00416
00422 double proba_c(const Sequence& seq, long tbeg, long tend, short numphase=0) const{
00423 return proba_c(seq.seq(), tbeg, tend, numphase);
00424 }
00425
00427
00433 double proba(const Sequence& seq, long tbeg, long tend, short numphase=0) const{
00434 return proba(seq.seq(), tbeg, tend, numphase);
00435 }
00436
00437
00438
00440
00445 double log_likelihood( const SequenceSet & seqset,
00446 short initial_phase = 0, short numphase=-1 ) const;
00447
00449
00458 double log_ratio_likelihood( const SequenceSet & seqset,
00459 const PhasedMarkov &M,
00460 short initial_phase1=0,
00461 short initial_phase2=0 ) const ;
00462
00463
00465
00470 double log_likelihood( const Sequence & seq,
00471 short initial_phase = 0, short numphase=-1 ) const;
00472
00474
00483 double log_ratio_likelihood( const Sequence & seq,
00484 const PhasedMarkov &M,
00485 short initial_phase1=0,
00486 short initial_phase2=0 ) const ;
00487
00489
00493 template <class TSeq>
00494 double BIC( const TSeq & tseq, short initial_phase=0 ) const{
00495
00496 return( -2*log_likelihood( tseq, initial_phase ) + _nb_param*log( (double)tseq.tell_length() ) );
00497 }
00498
00499
00501
00505 template <class TSeq>
00506 double AIC( const TSeq & tseq, short initial_phase=0 ) const{
00507
00508 return( -2*log_likelihood( tseq, initial_phase ) + 2*_nb_param );
00509 }
00510
00511
00512
00514
00521 template <class TSeq1, class TSeq2>
00522 double post_log_likelihood( const TSeq1& tseq_train, const TSeq2& tseq_eval,
00523 bool force=false,
00524 short initial_phase_train = 0, short initial_phase_eval = 0 )
00525 {
00526 double LogLik_te = 0., LogLik_t = 0.,sum1=0., sum2=0.;
00527 short p, u;
00528 long j, word;
00529
00530 if (( tseq_train.tell_order() != _order )||( tseq_eval.tell_order() != _order )){
00531 cerr << "Order incompatibility beetween sequence and model " <<endl;
00532 exit(-1);
00533 }
00534
00535 if (_prior_alpha.size() == 0){
00536 cerr << "Prior input required for post_log_likelihood." <<endl;
00537 exit(-1);
00538 }
00539
00540
00541
00542 tseq_train.count_p_occurencies( _phase, initial_phase_train );
00543 tseq_eval.count_p_occurencies(_phase, initial_phase_eval);
00544
00545 unsigned long * countp_t, * countp_e;
00546 long jump = tseq_train.tell_jump();
00547
00548
00549 vector<double> lng1;
00550 lng1.reserve(_size);
00551 for (p=0; p<_phase; p++){
00552
00553 double sum = 0;
00554 for (u=0; u<_size; u++){
00555 lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00556 sum += _prior_alpha[p][u];
00557 }
00558 double lng2 = gsl_sf_lngamma(sum);
00559
00560 countp_t = (tseq_train.get_p_count())[p] + jump;
00561 countp_e = (tseq_eval.get_p_count())[p] + jump;
00562
00563 for(j = 0; j < _nMu; j++){
00564 sum1=0., sum2=0.;
00565 word = j*_size;
00566 for (u=0; u<_size; u++){
00567 sum1 = countp_t[word+u] + countp_e[word+u] + _prior_alpha[p][u];
00568 LogLik_te += gsl_sf_lngamma(sum1) - lng1[u];
00569 sum2 += sum1;
00570 }
00571 LogLik_te += lng2 - gsl_sf_lngamma(sum2);
00572 }
00573 }
00574
00575 if ((_postloglike>0)||(force==true)){
00576 for (p=0; p<_phase; p++){
00577
00578 double sum = 0;
00579 for (u=0; u<_size; u++){
00580 lng1[u] = gsl_sf_lngamma(_prior_alpha[p][u]);
00581 sum += _prior_alpha[p][u];
00582 }
00583 double lng2 = gsl_sf_lngamma(sum);
00584
00585 countp_t = (tseq_train.get_p_count())[p] + jump;
00586 for(j = 0; j < _nMu; j++){
00587 sum1=0., sum2=0.;
00588 word = j*_size;
00589 for (u=0; u<_size; u++){
00590 sum1 = countp_t[word+u] + _prior_alpha[p][u];
00591 LogLik_t += gsl_sf_lngamma(sum1) - lng1[u];
00592 sum2 += sum1;
00593 }
00594 LogLik_t += lng2 - gsl_sf_lngamma(sum2);
00595 }
00596 }
00597 _postloglike = LogLik_t;
00598 }
00599
00600 return(LogLik_te - _postloglike);
00601 }
00602
00603
00604
00605
00607
00628 inline void print( const string &FileOut );
00629
00631 void print(ofstream &Out) const;
00632
00633
00635 int tell_size() const { return(_size);};
00637 int tell_rank() const { return(_rank);};
00639 int tell_order() const { return(_order);};
00641 int tell_phase() const { return(_phase); }
00642
00643
00645 int nMu() const { return(_nMu); }
00647 int nPi() const { return(_nPi); }
00648
00650
00654 double Pi(int index, int p=0) const {
00655 return(_Pis[p][index]);
00656 }
00658
00662 double & operator() (int index, int p=0){
00663 return(_Pis[p][index]);
00664 }
00665
00667
00671 double Mu(int index, int p=0) const {
00672 return(_Mus[p][index]);
00673 }
00675 bool isPis() const { return(_Pis != NULL);};
00677 bool isMus() const { return(_Mus != NULL);};
00678
00679
00681 inline short nextPhase(short p) const;
00683 inline short prevPhase(short p) const;
00684
00685
00687 bool Stochasticity();
00688
00690
00694 void file_to_count( const string& src_file, unsigned long ** dest_count);
00695
00696 protected :
00697
00699 short _phase;
00700
00702 double **_Pis;
00704 double **_containers;
00705
00707 double **_Mus;
00708
00710 short _size;
00711
00713 short _order;
00714
00716 long _nPi;
00717
00719 long _nMu;
00720
00722 long _nb_param;
00723
00725 int _rank;
00726
00728 long _jump;
00729
00731 short *_nextPhase;
00732
00734 short *_prevPhase;
00735
00737 const Translator * _trans;
00738
00740 double _postloglike;
00741
00743 vector< vector<double> > _prior_alpha;
00744
00745
00746 inline void FreeNextPhase();
00747 inline void FreePrevPhase();
00748
00750 bool isNextPhase() const { return(_nextPhase != NULL);};
00752 bool isPrevPhase() const { return(_prevPhase != NULL);};
00753
00754 bool load_count( const string & count_file, unsigned long** data );
00755
00756 bool load_prior_alpha( const string & prior_alpha_file );
00757 };
00758
00759
00760 inline short PhasedMarkov::nextPhase(short p) const
00761 {
00762 return (_nextPhase[p]);
00763 }
00764
00765 inline short PhasedMarkov::prevPhase(short p) const
00766 {
00767 return (_prevPhase[p]);
00768 }
00769
00770 inline void PhasedMarkov::new_markov_matrices()
00771 {
00772 if (!_Pis){
00773 _containers = new double*[_phase];
00774 _Pis = new double*[_phase];
00775 for (short p=0; p<_phase; p++){
00776 _containers[p] = new double[_nPi];
00777 _Pis[p] = _containers[p];
00778 }
00779 }
00780 }
00781
00782 inline void PhasedMarkov::free_markov_matrices()
00783 {
00784 if (_Pis){
00785 for (int p=0;p<_phase;p++)
00786 delete []_containers[p];
00787 delete []_containers;
00788
00789 delete []_Pis;
00790 }
00791 _Pis = NULL;
00792 _containers = NULL;
00793 }
00794
00795 inline void PhasedMarkov::free_stat_laws()
00796 {
00797 if (_Mus)
00798 {
00799 for (int p=0;p<_phase;p++)
00800 delete []_Mus[p];
00801 delete []_Mus;
00802 }
00803 _Mus = NULL;
00804 }
00805
00806 inline void PhasedMarkov::FreeNextPhase()
00807 {
00808 if (isNextPhase())
00809 delete [] _nextPhase;
00810 _nextPhase = NULL;
00811 }
00812
00813 inline void PhasedMarkov::FreePrevPhase()
00814 {
00815 if (isPrevPhase())
00816 delete [] _prevPhase;
00817 _prevPhase = NULL;
00818 }
00819
00820
00821 inline void PhasedMarkov::print(const string & FileOut )
00822 {
00823 ofstream Out(FileOut.c_str());
00824 print(Out);
00825 }
00826
00827 #endif
00828