]> ruin.nu Git - germs.git/blob - src/genealgorithms.cpp
calculate the proper distance, with hurdles
[germs.git] / src / genealgorithms.cpp
1 /***************************************************************************
2  *   Copyright (C) 2006 by Michael Andreen                                 *
3  *   andreen@student.chalmers.se                                           *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA          *
19  ***************************************************************************/
20
21 #include "genealgorithms.h"
22 #include "geneorder.h"
23 #include "componenttree.h"
24
25 #include <algorithm>
26 #include <set>
27 #include <stack>
28 #include <cstdlib>
29 #include <iostream>
30 using namespace std;
31
32 std::pair<int,int> longestSequences(const GeneOrder& go){
33         vector<vector<int> > v = robinsonSchensted(go);
34         return pair<int,int>(v[0].size(),v.size());
35 }
36
37 std::vector<std::vector<int> > robinsonSchensted(const GeneOrder& go){
38         vector<vector<int> > v;
39         for (GeneOrder::iterator i = go.begin(); i != go.end(); ++i){
40                 int n = abs(*i);
41                 bool added = false;
42                 for (vector<vector<int> >::iterator vs = v.begin();
43                                 vs != v.end(); ++vs){
44                         vector<int>::iterator bigger = upper_bound(vs->begin(),vs->end(),n);
45                         if ( bigger == vs->end()){
46                                 vs->push_back(n);
47                                 added = true;
48                                 break;
49                         }else{
50                                 swap(n,*bigger);
51                         }
52                 }
53                 if (!added){
54                         v.push_back(vector<int>());
55                         v.back().push_back(n);
56                 }
57         }
58         return v;
59 }
60
61 struct FindP{
62         size_t p;
63         FindP(size_t p) : p(p) {}
64         bool operator()(Interval i){
65                 return (i.first == p || i.second == p);
66         }
67 };
68
69
70 size_t countCycles(const GeneOrder& go){
71         size_t cycles = 0;
72         set<size_t> marked;
73         vector<Interval> intervals = findIntervals(go);
74         vector<Interval> points = findIntervalsAtPoints(intervals);
75         for (size_t p = 1; p < go.size(); ++p){
76                 if (marked.find(p) != marked.end())
77                         continue;
78                 Interval i = intervals[points[p].first];
79                 while (marked.find(p) == marked.end()){
80                         marked.insert(p);
81                         if (i == intervals[points[p].first])
82                                 i = intervals[points[p].second];
83                         else
84                                 i = intervals[points[p].first];
85
86                         if (p == i.first)
87                                 p = i.second;
88                         else
89                                 p = i.first;
90                 }
91                 ++cycles;
92         }
93         return cycles;
94 }
95
96 size_t inversionDistance(const GeneOrder& go){
97         size_t distance = go.size() - 1;
98         distance -= countCycles(go);
99
100         ComponentTree t(findComponents(go));
101         t.makeUnoriented();
102         size_t leaves = t.countLeaves();
103         distance += leaves;
104         if (leaves % 2 != 0){
105                 size_t sb = t.shortBranches();
106                 if (sb == 0)
107                         distance += 1;
108         }
109
110         return distance;
111 }
112
113 int sign(Gene g){
114         if (g > 0)
115                 return 1;
116         if (g < 0)
117                 return -1;
118         return 0;
119 }
120
121 struct Abs{
122         Gene operator()(Gene x) const{
123                 return abs(x);
124         }
125 };
126 std::vector<Component> findComponents(const GeneOrder& go){
127         vector<Component> components;
128         vector<int> os(go.size()-1);
129         for (size_t i = 0; i < os.size(); ++i)
130                 os[i] = (go[i]*go[i+1] > 0 ? sign(go[i]) : 0);
131         stack<Gene> Mdir;
132         Mdir.push(go.size()-1);
133         stack<Gene> Mrev;
134         Mrev.push(0);
135         stack<size_t> Sdir;
136         Sdir.push(0);
137         stack<size_t> Srev;
138         Srev.push(0);
139         vector<Gene> dir;
140         dir.push_back(go.size()-1);
141         vector<Gene> rev;
142         rev.push_back(0);
143         size_t s;
144         vector<Gene> p(go.list());
145         transform(p.begin(),p.end(),p.begin(),Abs());
146         for (size_t i = 1; i < go.size(); ++i){
147                 //Directed
148                 if (p[i-1] > p[i])
149                         Mdir.push(p[i-1]);
150                 else while (Mdir.top() < p[i])
151                         Mdir.pop();
152                 dir.push_back(Mdir.top());
153
154                 s = Sdir.top();
155                 while(p[Sdir.top()] > p[i] || dir[Sdir.top()] < p[i]){
156                         Sdir.pop();
157                         os[Sdir.top()] = (os[Sdir.top()] == os[s] ? os[s] : 0);
158                         s = Sdir.top();
159                 }
160                 if (go[i] > 0 && dir[i] == dir[s] && static_cast<Gene>(i - s) == p[i] - p[s])
161                         components.push_back(Component(p[s],p[i],(s+1 == i ? 0 : os[s]),s,i));
162
163                 //Reverse
164                 if (p[i-1] < p[i])
165                         Mrev.push(p[i-1]);
166                 else while (Mrev.top() > p[i])
167                         Mrev.pop();
168                 rev.push_back(Mrev.top());
169
170                 s = Srev.top();
171                 while((p[s] < p[i] || rev[s] > p[i]) && s > 0){
172                         Srev.pop();
173                         os[Srev.top()] *= (os[Srev.top()] == os[s] ? 1 : 0);
174                         s = Srev.top();
175                 }
176                 if (go[i] < 0 && rev[i] == rev[s] && static_cast<Gene>(i - s) == p[s] - p[i])
177                         components.push_back(Component(-p[s],-p[i],(s+1 == i ? 0 : os[s]),s,i));
178
179                 //Update stacks
180                 if (go[i] > 0)
181                         Sdir.push(i);
182                 else
183                         Srev.push(i);
184         }
185         return components;
186 }
187
188 int sign2(Gene g){
189         if (g < 0)
190                 return -1;
191         return 1;
192 }
193 /**
194  * 
195  */
196 std::vector<Interval> findIntervals(const GeneOrder& go){
197         const size_t max = go.size();
198         vector<Interval> intervals(go.size()-1,Interval(max,max,false));
199         size_t n = 0;
200         const GeneOrder::iterator end = go.end();
201         for (GeneOrder::iterator g = go.begin(); g != end; ++g, ++n){
202                         size_t i = abs(*g);
203                         if (i < max - 1){
204                                 Interval& curr = intervals[i];
205                                 curr.first = n + (*g >= 0 ? 1 : 0);
206
207                                 if (curr.second == max)
208                                         curr.oriented = *g < 0;
209                                 else
210                                         curr.oriented ^=  *g < 0;
211                         }
212                         if (i > 0){
213                                 Interval& prev = intervals[i-1];
214                                 prev.second = n + (*g < 0 ? 1 : 0);
215
216                                 if (prev.first == max)
217                                         prev.oriented = *g < 0;
218                                 else
219                                         prev.oriented ^=  *g < 0;
220                         }
221
222         }
223         return intervals;
224 }
225
226 /**
227  * 
228  */
229 std::vector<Interval> findIntervalsAtPoints(const vector<Interval>& intervals){
230         size_t max = intervals.size()+1;
231         vector<Interval> points(max,Interval(max,max,false));
232         size_t n = 0;
233         for (vector<Interval>::const_iterator i = intervals.begin(); i != intervals.end(); ++i, ++n){
234                 if (points[i->first].first == max){
235                         points[i->first].first = n;
236                 }else
237                         points[i->first].second = n;
238                         
239                 if (points[i->second].first == max){
240                         points[i->second].first = n;
241                 }else
242                         points[i->second].second = n;
243         }
244         return points;
245
246 }