Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
bd07c60d
Commit
bd07c60d
authored
Aug 02, 2020
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix_to_FASTA_header
parent
a30165e5
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
1230 additions
and
34 deletions
+1230
-34
guide.py
guide.py
+3
-3
hca.py
hca.py
+921
-0
heap.py
heap.py
+105
-0
ival.py
ival.py
+1
-1
phylo.py
phylo.py
+31
-13
sequence.py
sequence.py
+10
-9
sstruct.py
sstruct.py
+0
-2
test_hca.py
test_hca.py
+94
-0
test_heap.py
test_heap.py
+59
-0
webservice.py
webservice.py
+6
-6
No files found.
guide.py
View file @
bd07c60d
...
...
@@ -288,7 +288,7 @@ def scoreAlignment(aln, substmat = None, gap = -1):
if
gap_here
:
score
=
gap
else
:
score
=
substmat
.
get
(
aln
.
seqs
[
i
][
pos
],
aln
.
seqs
[
j
][
pos
])
score
=
substmat
.
__getitem__
(
aln
.
seqs
[
i
][
pos
],
aln
.
seqs
[
j
][
pos
])
if
min
==
None
:
min
=
score
elif
min
>
score
:
...
...
@@ -317,7 +317,7 @@ def align(seqA, seqB, substMatrix, gap=-1):
# Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
for
i
in
range
(
1
,
lenA
+
1
):
for
j
in
range
(
1
,
lenB
+
1
):
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
get
(
stringA
[
i
-
1
],
stringB
[
j
-
1
])
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
__getitem__
(
stringA
[
i
-
1
],
stringB
[
j
-
1
])
delete
=
S
[
i
-
1
,
j
]
+
gap
insert
=
S
[
i
,
j
-
1
]
+
gap
Traceback
[
i
,
j
]
=
numpy
.
argmax
([
match
,
delete
,
insert
])
...
...
@@ -382,7 +382,7 @@ class SubstMatrix():
G -1 -1 1
T -1 -1 -1 1
A C G T
>>> sm.
get
('C', 'T')
>>> sm.
__getitem__
('C', 'T')
-1
"""
def
__init__
(
self
,
alphabet
,
scoremat
=
None
):
...
...
hca.py
0 → 100644
View file @
bd07c60d
import
math
import
numpy
as
np
from
scipy.cluster
import
hierarchy
import
matplotlib.pyplot
as
plt
import
stats
from
heap
import
*
class
PairArray
:
""" Storage for all unordered pairs of indices (a, b) == (b, a) and (a != b), where both a and b are members in the set 0..N.
Space requirement is N * N / 2 - N / 2, e.g. N=8 gives 28 cells """
def
__init__
(
self
,
N
):
self
.
_N
=
N
self
.
_store
=
[
None
for
_
in
range
((
N
*
N
)
//
2
-
(
N
//
2
))]
def
hash
(
self
,
idx1
,
idx2
):
if
idx1
==
idx2
:
return
None
if
idx1
<
idx2
:
return
idx1
*
self
.
_N
+
idx2
-
((
idx1
+
1
)
**
2
//
2
+
(
idx1
+
2
)
//
2
)
else
:
return
idx2
*
self
.
_N
+
idx1
-
((
idx2
+
1
)
**
2
//
2
+
(
idx2
+
2
)
//
2
)
def
__setitem__
(
self
,
idxtuple
,
value
):
self
.
_store
[
self
.
hash
(
idxtuple
[
0
],
idxtuple
[
1
])]
=
value
def
__getitem__
(
self
,
idxtuple
):
return
self
.
_store
[
self
.
hash
(
idxtuple
[
0
],
idxtuple
[
1
])]
class
DNode
:
""" A class for a node in a dendrogram for agglomerative clustering, assuming ultrametricity. """
def
__init__
(
self
,
idx
=
None
,
children
=
[],
dist
=
None
):
"""
Initialise a node by the children it joins (if any).
If linked, children already need to have been created;
their reciprocal linkages will be created/ensured.
Use dist to indicate the distance between each child and this node.
:param idx: index/label of node
:param children: the list of DNode instances making up the children (default empty list)
:param dist: distance from children to this node
"""
self
.
parent
=
None
self
.
idx
=
idx
self
.
children
=
children
self
.
dist
=
dist
if
children
:
self
.
leaves
=
0
self
.
refs
=
[]
for
child
in
children
:
# link each child to this parent
child
.
parent
=
self
self
.
leaves
+=
child
.
leaves
self
.
refs
.
extend
(
child
.
refs
)
else
:
self
.
leaves
=
1
self
.
refs
=
[
idx
]
def
nChildren
(
self
):
"""
Check how many direct children this node has
:return: number of children
"""
if
self
.
children
:
return
len
(
self
.
children
)
else
:
return
0
def
getLeaves
(
self
):
"""
Retrieve all leaves under this node (recursively)
:return: all nodes that do not have children themselves under this node
"""
if
self
.
children
:
leaves
=
[]
for
c
in
self
.
children
:
leaves
.
extend
(
c
.
getLeaves
())
return
leaves
else
:
return
[
self
]
def
getDistances
(
self
,
sort
=
False
):
if
self
.
children
:
d
=
[
self
.
dist
]
for
child
in
self
.
children
:
d
.
extend
(
child
.
getDistances
()
if
not
sort
else
sorted
(
child
.
getDistances
(),
reverse
=
True
))
return
d
else
:
return
[]
def
findNode
(
self
,
idx
):
if
self
.
leaves
==
1
:
return
self
if
idx
in
self
.
refs
else
None
else
:
if
idx
==
self
.
idx
:
return
self
for
child
in
self
.
children
:
catch
=
child
.
findNode
(
idx
)
if
catch
:
return
catch
return
None
def
getParents
(
self
):
if
self
.
parent
==
None
:
return
[]
else
:
pp
=
self
.
parent
.
getParents
()
if
pp
:
pp
.
append
(
self
.
parent
)
return
pp
else
:
return
[
self
.
parent
]
def
getNodes
(
self
,
mindist
=
0
,
internal
=
False
):
if
self
.
leaves
==
1
:
return
[]
if
internal
else
[
self
]
else
:
nodes
=
[]
if
self
.
dist
<
mindist
:
return
self
for
child
in
self
.
children
:
if
child
.
leaves
==
1
and
not
internal
:
nodes
.
extend
(
child
.
getNodes
(
mindist
))
elif
child
.
leaves
>
1
:
if
child
.
dist
>=
mindist
:
nodes
.
extend
(
child
.
getNodes
(
mindist
))
else
:
nodes
.
append
(
child
)
nodes
.
append
(
self
)
return
nodes
def
reIndex
(
self
,
childcount
,
parentcount
):
if
self
.
children
:
# this is a parent node
for
c
in
self
.
children
:
(
childcount
,
parentcount
)
=
c
.
reIndex
(
childcount
,
parentcount
)
self
.
idx
=
parentcount
for
c
in
self
.
children
:
c
.
parent
=
self
parentcount
+=
1
else
:
# this is a leaf node
self
.
idx
=
childcount
childcount
+=
1
return
(
childcount
,
parentcount
)
def
clone
(
self
):
n
=
DNode
(
self
.
idx
,
[
c
.
clone
()
for
c
in
self
.
children
],
self
.
dist
)
n
.
refs
=
self
.
refs
return
n
def
getNodesBarCondition
(
self
,
mindist
=
0
,
idxs
=
[]):
if
self
.
leaves
==
1
:
return
[
self
]
elif
self
.
dist
<
mindist
or
self
.
idx
in
idxs
:
return
[
self
]
else
:
children
=
[]
for
c
in
self
.
children
:
mynodes
=
c
.
getNodesBarCondition
(
mindist
,
idxs
)
children
.
extend
(
mynodes
)
children
.
append
(
self
)
return
children
def
cloneBarCondition
(
self
,
mindist
=
0
,
idxs
=
[],
leakytrace
=
[]):
if
self
.
leaves
==
1
:
n
=
DNode
(
self
.
idx
)
elif
(
self
.
dist
<
mindist
or
self
.
idx
in
idxs
)
and
not
self
.
idx
in
leakytrace
:
n
=
DNode
(
self
.
idx
)
else
:
children
=
[]
for
c
in
self
.
children
:
myclone
=
c
.
cloneBarCondition
(
mindist
,
idxs
,
leakytrace
)
children
.
append
(
myclone
)
if
len
(
children
)
>
1
:
n
=
DNode
(
self
.
idx
,
children
,
self
.
dist
)
else
:
n
=
DNode
(
self
.
idx
)
n
.
refs
=
self
.
refs
return
n
def
__repr__
(
self
):
return
'<'
+
str
(
self
.
idx
)
+
'('
+
str
(
self
.
leaves
)
+
'|'
+
str
(
len
(
self
.
refs
))
+
'):'
+
str
(
self
.
dist
)
+
">"
if
self
.
dist
else
'<'
+
str
(
self
.
idx
)
+
">"
def
__str__
(
self
):
""" Returns string with node (incl descendants) in a Newick style. """
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
str
(
self
.
children
[
i
])
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
label
=
str
(
self
.
idx
)
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
def
newick
(
self
,
labels
):
""" Returns string with node (incl descendants) in a Newick style. """
stubs
=
[
''
for
_
in
range
(
self
.
nChildren
())]
label
=
dist
=
''
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
self
.
children
[
i
]
.
newick
(
labels
)
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
idx
!=
None
and
self
.
idx
<
len
(
labels
):
label
=
labels
[
self
.
idx
]
if
self
.
nChildren
()
==
0
:
return
label
+
dist
else
:
stubstr
=
'('
for
i
in
range
(
len
(
stubs
)
-
1
):
stubstr
+=
stubs
[
i
]
+
','
return
stubstr
+
stubs
[
-
1
]
+
')'
+
label
+
dist
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def
parse
(
string
,
idx2idx
=
{}):
"""
Parse a Newick string to create a (root) node (including descendant nodes)
:param string: Newick formatted string
:param idx2idx: dictionary to translate indices/labels (default empty)
:return: root node of tree described by input string
"""
first
=
string
.
find
(
'('
)
last
=
string
[::
-
1
]
.
find
(
')'
)
# look from the back
if
first
==
-
1
and
last
==
-
1
:
# we are at leaf
y
=
string
.
split
(
':'
)
if
y
[
0
]
in
idx2idx
:
node
=
DNode
(
idx
=
idx2idx
[
y
[
0
]])
else
:
node
=
DNode
(
idx
=
y
[
0
])
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
return
node
elif
first
>=
0
and
last
>=
0
:
# remove parentheses
last
=
len
(
string
)
-
last
-
1
# correct index to refer from start instead of end of string
embed
=
string
[
first
+
1
:
last
]
tail
=
string
[
last
+
1
:]
# find where corresp comma is
commas
=
_findComma
(
embed
)
if
len
(
commas
)
<
1
:
raise
RuntimeError
(
'Invalid format: invalid placement of "," in sub-string "'
+
embed
+
'"'
)
prev_comma
=
0
child_tokens
=
[]
for
comma
in
commas
:
child_tokens
.
append
(
embed
[
prev_comma
:
comma
]
.
strip
())
prev_comma
=
comma
+
1
child_tokens
.
append
(
embed
[
prev_comma
:]
.
strip
())
y
=
tail
.
split
(
':'
)
if
len
(
y
[
0
])
==
0
:
node
=
DNode
()
elif
y
[
0
]
in
idx2idx
:
node
=
DNode
(
idx
=
idx2idx
[
y
[
0
]])
else
:
node
=
DNode
(
idx
=
y
[
0
])
if
len
(
y
)
>=
2
:
node
.
dist
=
float
(
y
[
1
])
node
.
children
=
[]
node
.
leaves
=
0
node
.
refs
=
[]
for
tok
in
child_tokens
:
child
=
parse
(
tok
,
idx2idx
)
child
.
parent
=
node
node
.
leaves
+=
child
.
leaves
node
.
refs
.
extend
(
child
.
refs
)
node
.
children
.
append
(
child
)
return
node
else
:
raise
RuntimeError
(
'Invalid format: unbalanced parentheses in sub-string "'
+
string
+
'"'
)
def
_findComma
(
string
,
level
=
0
):
""" Find all commas at specified level of embedding """
mylevel
=
0
commas
=
[]
for
i
in
range
(
len
(
string
)):
if
string
[
i
]
==
'('
:
mylevel
+=
1
elif
string
[
i
]
==
')'
:
mylevel
-=
1
elif
string
[
i
]
==
','
and
mylevel
==
level
:
commas
.
append
(
i
)
return
commas
class
_RegistryIterator
:
def
__init__
(
self
,
registry
):
assert
len
(
registry
.
_inuse
)
>
1
,
"Must have at least two indices to iterate"
self
.
_registry
=
registry
self
.
_n
=
len
(
registry
.
_inuse
)
self
.
_inuse
=
list
(
registry
.
_inuse
)
self
.
_curi
=
0
self
.
_curj
=
1
def
__iter__
(
self
):
return
self
def
__next__
(
self
):
for
i
in
range
(
self
.
_curi
,
self
.
_n
-
1
):
self
.
_curj
=
max
(
self
.
_curi
+
1
,
self
.
_curj
)
for
j
in
range
(
self
.
_curj
,
self
.
_n
):
address
=
self
.
_registry
.
encode
(
self
.
_inuse
[
i
],
self
.
_inuse
[
j
])
y
=
self
.
_registry
.
_elements
[
address
]
if
y
:
if
j
<
self
.
_n
-
1
:
self
.
_curj
=
j
+
1
else
:
self
.
_curi
=
i
+
1
self
.
_curj
=
self
.
_curi
+
1
return
y
else
:
raise
StopIteration
class
Registry
:
def
__init__
(
self
,
N
):
self
.
N
=
N
# number of leaves
self
.
NwP
=
2
*
N
-
1
# number of leaves and parents
self
.
NR
=
self
.
NwP
**
2
//
2
-
self
.
NwP
//
2
# number of records to keep
self
.
_elements
=
np
.
array
([
None
for
i
in
range
(
self
.
NR
)])
# self.NR records
self
.
_inuse
=
set
([
i
for
i
in
range
(
N
)])
# define what indices that are currently in use
self
.
k
=
N
def
__iter__
(
self
):
return
_RegistryIterator
(
self
)
def
useIndex
(
self
,
idx
):
self
.
_inuse
.
add
(
idx
)
def
removeIndex
(
self
,
idx
):
self
.
_inuse
.
remove
(
idx
)
def
encode
(
self
,
idx1
,
idx2
):
if
idx1
==
idx2
:
return
None
if
idx1
<
idx2
:
return
idx1
*
(
2
*
self
.
N
-
1
)
+
idx2
-
((
idx1
+
1
)
**
2
//
2
+
(
idx1
+
2
)
//
2
)
else
:
return
idx2
*
(
2
*
self
.
N
-
1
)
+
idx1
-
((
idx2
+
1
)
**
2
//
2
+
(
idx2
+
2
)
//
2
)
def
putDNode
(
self
,
dnode
):
if
dnode
.
children
[
0
]
.
idx
==
None
:
dnode
.
children
[
0
]
.
idx
=
self
.
k
self
.
k
+=
1
if
dnode
.
children
[
1
]
.
idx
==
None
:
dnode
.
children
[
1
]
.
idx
=
self
.
k
self
.
k
+=
1
idx1
=
dnode
.
children
[
0
]
.
idx
idx2
=
dnode
.
children
[
1
]
.
idx
address
=
self
.
encode
(
idx1
,
idx2
)
self
.
_elements
[
address
]
=
dnode
def
get
(
self
,
address
):
return
self
.
_elements
[
address
]
def
set
(
self
,
address
,
value
):
self
.
_elements
[
address
]
=
value
def
getDNode
(
self
,
idx1
,
idx2
):
address
=
self
.
encode
(
idx1
,
idx2
)
if
self
.
_elements
[
address
]:
return
self
.
_elements
[
address
]
else
:
return
None
def
decode
(
self
,
address
):
# address = idx1 * (2 * self.N - 1) + idx2 - ((idx1+1)**2 // 2 + (idx1+2) // 2)
dnode
=
self
.
_elements
[
address
]
if
dnode
:
return
[
dnode
.
children
[
0
]
.
idx
,
dnode
.
children
[
1
]
.
idx
]
# idx1 = self._elements[address][0]
# idx2 = address - idx1 * (2 * self.N - 1) + ((idx1+1)**2 // 2 + (idx1+2) // 2)
return
None
def
getClosest
(
self
):
closest
=
None
try
:
for
node
in
self
:
if
closest
==
None
or
node
.
dist
<
closest
.
dist
:
closest
=
node
except
:
pass
return
closest
class
Dendrogram
:
""" A class for a dendrogram for agglomerative clustering, assuming ultrametricity """
def
__init__
(
self
):
self
.
Labels
=
[]
# Labels, if available
self
.
Data
=
[]
# Data, if available
self
.
BaseTree
=
None
# Root of base tree/dendrogram, if available
self
.
defaultview
=
'original'
# current, default view is 'original'
self
.
views
=
{
self
.
defaultview
:
None
}
# root of each view
self
.
means
=
{}
# dict frozenset/refs : list/v
def
loadBaseTree
(
self
,
filename
):
""" Read file on Newick format and set as base tree in this dendrogram. """
f
=
open
(
filename
)
string
=
''
.
join
(
f
)
self
.
BaseTree
=
self
.
parseNewick
(
string
)
self
.
N
=
k
=
len
(
self
.
Labels
)
# re-indexed, viewable version
self
.
BaseTree
.
reIndex
(
0
,
self
.
N
)
self
.
views
[
self
.
defaultview
]
=
self
.
BaseTree
self
.
setView
(
self
.
defaultview
)
def
parseNewick
(
self
,
string
):
""" Main method for parsing a Newick string into a tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns the root node, which defines the tree. """
if
string
.
find
(
';'
)
!=
-
1
:
string
=
string
[:
string
.
find
(
';'
)]
return
parse
(
string
)
def
saveBaseTree
(
self
,
filename
):
"""
Save the base tree that defines the dendrogram; not that distances are in the tree but not in the form of a data matrix
:param filename: name of file, under which the dendrogram base tree is saved
"""
assert
self
.
BaseTree
and
self
.
Labels
!=
None
,
"Invalid tree: cannot save"
with
open
(
filename
,
'w'
)
as
fh
:
print
(
self
.
BaseTree
.
__str__
(
self
.
Labels
),
end
=
""
,
file
=
fh
)
def
loadData
(
self
,
filename
,
headers
=
False
):
"""
Load the data associated with this dendrogram.
:param filename: the name of the tab-separated value file
:param headers: if headers are present in the file, default is False
"""
reindex
=
False
if
self
.
Labels
and
self
.
BaseTree
:
# labels are already available for base tree, so data need to be re-indexed
reindex
=
True
self
.
Data
=
[
None
for
_
in
range
(
len
(
self
.
Labels
))]
with
open
(
filename
)
as
csvfile
:
r
=
csv
.
reader
(
csvfile
,
delimiter
=
'
\t
'
)
headers
=
[]
if
headers
else
None
for
row
in
r
:
if
headers
==
[]:
headers
=
row
[
1
:]
elif
reindex
:
self
.
Data
[
self
.
indexLabel
(
row
[
0
])]
=
[
float
(
y
)
for
y
in
row
[
1
:]]
else
:
self
.
Data
.
append
([
float
(
y
)
for
y
in
row
[
1
:]])
self
.
Labels
.
append
(
row
[
0
])
def
setData
(
self
,
data
,
labels
):
"""
Assign data vectors to labels in this dendrogram; will match with pre-existing tree and overwrite current values if any
:param data: data vectors
:param labels: labels (same index as data)
"""
assert
len
(
data
)
==
len
(
labels
),
"Number of data rows and labels must be the same"
reindex
=
False
if
self
.
Labels
and
self
.
BaseTree
:
# labels are already available for base tree, so data need to be re-indexed
reindex
=
True
self
.
Data
=
[
None
for
_
in
range
(
len
(
self
.
Labels
))]
for
i
in
range
(
len
(
data
)):
if
reindex
:
idx
=
self
.
indexLabel
(
labels
[
i
])
self
.
Data
[
idx
]
=
data
[
idx
]
else
:
self
.
Data
.
append
(
data
[
i
])
self
.
Labels
.
append
(
labels
[
i
])
def
normData
(
self
,
mode
=
None
):
"""
Normalise data
:param mode: normalisation mode, default is None (unit length)
"""
for
i
in
range
(
len
(
self
.
Data
)):
self
.
Data
[
i
]
=
normalise
(
self
.
Data
[
i
])
def
createBaseTree
(
self
,
d
,
labels
=
None
):
" d is the distance matrix, labels is the list of labels "
self
.
N
=
len
(
d
)
self
.
Labels
=
labels
or
[
'X'
+
str
(
i
+
1
)
for
i
in
range
(
self
.
N
)]
registry
=
Registry
(
self
.
N
)
" For each node-pair, assign the distance between them. "
dnodes
=
[
DNode
(
idx
=
i
)
for
i
in
range
(
self
.
N
)]
for
i
in
range
(
self
.
N
):
registry
.
useIndex
(
i
)
for
j
in
range
(
i
+
1
,
self
.
N
):
registry
.
putDNode
(
DNode
(
children
=
[
dnodes
[
i
],
dnodes
[
j
]],
dist
=
d
[
i
,
j
]))
# Treat each node as a cluster, until there is only one cluster left, find the closest pair
# of clusters, and merge that pair into a new cluster (to replace the two that merged).
# In each case, the new cluster is represented by the node that is formed.
closest
=
registry
.
getClosest
()
while
closest
:
# So we know the closest, now we need to merge...
x
=
closest
.
children
[
0
]
y
=
closest
.
children
[
1
]
z
=
closest
# use this node for new cluster z
Nx
=
x
.
leaves
# find number of sequences in x
Ny
=
y
.
leaves
# find number of sequences in y
for
widx
in
registry
.
_inuse
:
# for each node w ...
if
widx
!=
x
.
idx
and
widx
!=
y
.
idx
:
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw_address
=
registry
.
encode
(
x
.
idx
,
widx
)
dyw_address
=
registry
.
encode
(
y
.
idx
,
widx
)
dxw
=
registry
.
get
(
dxw_address
)
# retrieve and remove distance from D: x to w
dyw
=
registry
.
get
(
dyw_address
)
# retrieve and remove distance from D: y to w
w
=
dxw
.
children
[
0
]
if
dxw
.
children
[
0
]
.
idx
==
widx
else
dxw
.
children
[
1
]
dzw
=
DNode
(
children
=
[
z
,
w
],
dist
=
(
Nx
*
dxw
.
dist
+
Ny
*
dyw
.
dist
)
/
(
Nx
+
Ny
))
# distance: z to w
registry
.
set
(
dxw_address
,
None
)
registry
.
set
(
dyw_address
,
None
)
registry
.
putDNode
(
dzw
)
registry
.
useIndex
(
z
.
idx
)
# remove x and y from registry
registry
.
removeIndex
(
x
.
idx
)
registry
.
removeIndex
(
y
.
idx
)
closest
=
registry
.
getClosest
()
self
.
BaseTree
=
z
self
.
BaseTree
.
reIndex
(
0
,
self
.
N
)
# re-indexed, viewable version
self
.
views
[
self
.
defaultview
]
=
self
.
BaseTree
self
.
setView
(
self
.
defaultview
)
def
getRoot
(
self
,
view
=
None
):
if
not
view
:
view
=
self
.
defaultview
return
self
.
views
[
view
]
def
getIndex
(
self
,
label
):
for
i
in
range
(
len
(
self
.
Labels
)):
if
self
.
Labels
[
i
]
==
label
:
return
i
return
-
1
def
getNodeByIndex
(
self
,
idx
,
view
=
None
):
if
idx
!=
-
1
:
return
self
.
getRoot
(
view
)
.
findNode
(
idx
)
else
:
return
None
def
getNodeByName
(
self
,
label
,
view
=
None
):
idx
=
self
.
getIndex
(
label
)
if
idx
!=
-
1
:
return
self
.
getNodeByIndex
(
idx
,
view
)
else
:
return
None
def
_intersect
(
self
,
L1
,
L2
):
return
[
y
for
y
in
L1
if
y
in
L2
]
def
getTraceByMembers
(
self
,
members
=
[],
view
=
None
):
return
self
.
getRootByMembers
(
members
,
view
,
format
=
'trace'
)
def
getRootByMembers
(
self
,
members
=
[],
view
=
None
,
format
=
'root'
):
""" Retrieve the root of the smallest sub-tree that contains all the members. """
prevtrace
=
None
for
member
in
members
:
if
not
isinstance
(
member
,
DNode
):
name
=
member
member
=
self
.
getNodeByName
(
name
,
view
)
if
member
==
None
:
print
(
'Could not find any node with label'
,
name
)
trace
=
member
.
getParents
()
if
prevtrace
:
prevtrace
=
self
.
_intersect
(
prevtrace
,
trace
)
else
:
prevtrace
=
trace
if
format
==
'root'
and
len
(
prevtrace
)
>=
1
:
return
prevtrace
[
-
1
]
elif
format
==
'trace'
:
return
prevtrace
else
:
return
None
def
setView
(
self
,
name
):
self
.
defaultview
=
name
def
createView
(
self
,
name
,
newroot
=
None
,
mindist
=
0
,
terminate_nodes
=
[]):
idxs
=
[
n
.
idx
for
n
in
terminate_nodes
]
newroot
=
newroot
or
self
.
getRoot
()
newtree
=
newroot
.
cloneBarCondition
(
mindist
,
idxs
)
newtree
.
reIndex
(
0
,
len
(
newtree
.
getLeaves
()))
self
.
views
[
name
]
=
newtree
self
.
setView
(
name
)
def
createLeakyView
(
self
,
name
,
trace
=
[],
mindist
=
0
):
"""
Create view from specified minimum distance but all a trace of parent nodes for a subtree to leak through
:param name: name of view
:param trace: list of nodes that all lead to a subtree
:param mindist: the minimum distance for terminating nodes
:return:
"""
idxs
=
[
n
.
idx
for
n
in
trace
]
newtree
=
self
.
getRoot
()
.
cloneBarCondition
(
mindist
,
leakytrace
=
idxs
)
newtree
.
reIndex
(
0
,
len
(
newtree
.
getLeaves
()))
self
.
views
[
name
]
=
newtree
self
.
setView
(
name
)
def
getNodesByEnrichment
(
self
,
positiveLabels
=
[],
view
=
None
,
pval
=
0.05
,
format
=
'nodes only'
):
"""
Get all nodes in the view which are positively enriched for the specified labels.
Assumes that the tree is rooted.
:param positiveLabels: labels
:param view: view
:param pval: maximum p-value for the test to return node
:param format: "nodes only" means a sorted list with nodes, "with p" means a sorted list with tuples (node, p)
:return: a list of nodes or (node, p) tuples, sorted in order of ascending p-value (most significant first), truncated at max p-value
"""
root
=
self
.
getRoot
(
view
)
# set-up reference sets
pos
=
set
()
neg
=
set
()
for
idx
in
root
.
refs
:
label
=
self
.
Labels
[
idx
]
if
label
in
positiveLabels
:
pos
.
add
(
idx
)
else
:
neg
.
add
(
idx
)
if
len
(
pos
)
==
0
:
return
[]
# gather then try all nodes in the view
nodes
=
root
.
getNodes
()
res
=
dict
()
for
n
in
nodes
:
a1
=
a2
=
0
for
idx
in
n
.
refs
:
if
idx
in
pos
:
a1
+=
1
elif
idx
in
neg
:
a2
+=
1
b1
=
len
(
pos
)
-
a1
b2
=
len
(
neg
)
-
a2
p
=
stats
.
getFETpval
(
a1
,
a2
,
b1
,
b2
,
False
)
if
p
<
pval
:
res
[
n
]
=
p
sorted_tuples
=
sorted
(
res
.
items
(),
key
=
lambda
kv
:
kv
[
1
])
sorted_list
=
[
y
[
0
]
for
y
in
sorted_tuples
]
return
sorted_list
if
format
==
'nodes only'
else
sorted_tuples
def
getLinkage
(
self
,
nodes
):
links
=
[]
for
n
in
nodes
:
if
n
.
leaves
>
1
:
links
.
append
([
n
.
children
[
0
]
.
idx
,
n
.
children
[
1
]
.
idx
,
n
.
dist
,
n
.
leaves
])
return
links
black
=
'000000'
red2yellow
=
[
'200000'
,
'400000'
,
'550000'
,
'5f0000'
,
'690000'
,
'730000'
,
'7d0000'
,
'870000'
,
'910000'
,
'9b0000'
,
'a50000'
,
'af0000'
,
'b90000'
,
'c30000'
,
'cd0000'
,
'd70000'
,
'e10000'
,
'eb0000'
,
'f50000'
,
'FF0000'
,
'FF0a00'
,
'FF1400'
,
'FF1e00'
,
'FF2800'
,
'FF3200'
,
'FF3c00'
,
'FF4600'
,
'FF5000'
,
'FF5a00'
,
'FF6400'
,
'FF6e00'
,
'FF7800'
,
'FF8200'
,
'FF8c00'
,
'FF9600'
,
'FFa000'
,
'FFaa00'
,
'FFb400'
,
'FFbe00'
,
'FFc800'
,
'FFd200'
,
'FFdc00'
,
'FFe600'
,
'FFf000'
,
'FFfa00'
]
def
getHighlight
(
self
,
nodes
,
scored_nodes
):
mymax
=
None
mymin
=
None
for
(
node
,
score
)
in
scored_nodes
:
if
not
mymax
or
not
mymin
:
mymax
=
mymin
=
score
else
:
mymax
=
max
(
mymax
,
score
)
mymin
=
min
(
mymin
,
score
)
rate
=
(
mymax
-
mymin
)
/
len
(
self
.
red2yellow
)
print
(
'Min'
,
mymin
,
'Max'
,
mymax
,
'Rate'
,
rate
)
node2score
=
dict
()
for
(
node
,
score
)
in
scored_nodes
:
node2score
[
node
]
=
score
color_link
=
{}
for
n
in
nodes
:
# a node
if
n
.
leaves
>
1
:
# only internal nodes can be colored
try
:
score
=
node2score
[
n
]
step
=
int
((
score
-
mymin
)
/
rate
)
color_link
[
n
.
idx
]
=
'#'
+
self
.
red2yellow
[
step
]
print
(
step
,
end
=
' '
)
except
:
color_link
[
n
.
idx
]
=
'#'
+
self
.
black
print
()
return
color_link
def
getRGB
(
self
,
nodes
,
colored_nodes
=
[],
defaultcol
=
'B0B0B0'
):
assert
len
(
colored_nodes
)
==
3
,
"Must have three lists of nodes to use RGB"
allcolored
=
set
([])
for
color
in
colored_nodes
:
allcolored
.
update
(
color
)
color_link
=
{}
for
n
in
nodes
:
# a node
if
n
.
leaves
>
1
:
# only internal nodes can be colored
color_link
[
n
.
idx
]
=
'#'
if
n
in
allcolored
:
inall
=
True
for
color
in
colored_nodes
:
# lists for Red, Green then Blue
if
n
in
color
:
color_link
[
n
.
idx
]
+=
'E0'
else
:
color_link
[
n
.
idx
]
+=
'00'
inall
=
False
if
inall
:
color_link
[
n
.
idx
]
=
'#101010'
else
:
color_link
[
n
.
idx
]
+=
defaultcol
return
color_link
def
getRefs
(
self
,
nodes
):
"""
Get the group of indices for each node
:param nodes: the nodes of interest
:return: list of reference index lists
"""
groups
=
[]
for
n
in
nodes
:
if
n
.
leaves
==
1
:
groups
.
append
(
n
.
refs
)
return
groups
def
getMeanRefs
(
self
,
refs
):
"""
Determine the average vector for a group of reference indices
:param refs: the index group
:return: the average vector
"""
if
self
.
Data
:
key
=
frozenset
(
refs
)
if
key
in
self
.
means
:
return
self
.
means
[
key
]
m
=
None
for
r
in
refs
:
if
not
m
:
m
=
[
y
for
y
in
self
.
Data
[
r
]]
else
:
for
i
in
range
(
len
(
m
)):
m
[
i
]
+=
self
.
Data
[
r
][
i
]
m
=
[
y
/
len
(
refs
)
for
y
in
m
]
self
.
means
[
key
]
=
m
return
m
else
:
return
None
def
getLabelGroups
(
self
,
nodes
):
"""
Get the list of labels, associated with each node
:param nodes: nodes of interest
:return: the list of label lists
"""
groups
=
[]
for
n
in
nodes
:
if
n
.
leaves
==
1
:
plabels
=
[]
for
i
in
range
(
len
(
n
.
refs
)):
plabels
.
append
(
self
.
Labels
[
n
.
refs
[
i
]])
groups
.
append
(
plabels
)
return
groups
def
getString
(
self
,
v
,
s
,
thresholds
):
assert
len
(
s
)
-
1
==
len
(
thresholds
),
"characters in string s must agree with thresholds + 1"
ret
=
''
for
y
in
v
:
for
i
in
range
(
len
(
thresholds
)):
if
y
<
thresholds
[
i
]:
ret
+=
s
[
i
]
break
if
y
>=
thresholds
[
-
1
]:
ret
+=
s
[
-
1
]
return
ret
def
draw
(
self
,
view
=
None
,
maxlabel
=
15
,
mode
=
'label'
,
thresholds
=
None
,
nodesRGB
=
None
,
nodesHighlight
=
None
):
N
=
self
.
getRoot
(
view
)
.
getNodes
()
groups_wlabels
=
self
.
getLabelGroups
(
N
)
groups_windices
=
self
.
getRefs
(
N
)
labs
=
[
''
for
_
in
groups_wlabels
]
syms
=
' .o0'
if
thresholds
or
'vec'
in
mode
:
for
i
in
range
(
len
(
groups_windices
)):
group
=
groups_windices
[
i
]
meanv
=
self
.
getMeanRefs
(
group
)
if
thresholds
:
labs
[
i
]
+=
self
.
getString
(
meanv
,
syms
,
thresholds
)
else
:
for
y
in
meanv
:
labs
[
i
]
+=
'
%4.2
f '
%
y
if
'lab'
in
mode
:
for
i
in
range
(
len
(
groups_wlabels
)):
group
=
groups_wlabels
[
i
]
label
=
''
if
len
(
labs
[
i
])
==
0
else
' '
label
+=
''
if
len
(
group
)
==
1
else
str
(
len
(
group
))
+
'|'
for
jj
in
range
(
len
(
group
)):
j
=
len
(
group
)
-
1
-
jj
label
+=
group
[
j
]
+
(
''
if
j
==
0
else
':'
)
labs
[
i
]
+=
label
for
i
in
range
(
len
(
labs
)):
if
len
(
labs
[
i
])
>
maxlabel
:
labs
[
i
]
=
labs
[
i
][
0
:
maxlabel
-
2
]
+
'..'
L
=
self
.
getLinkage
(
N
)
Z
=
np
.
array
(
L
)
plt
.
rcParams
[
"font.family"
]
=
"Andale Mono"
if
not
nodesRGB
and
not
nodesHighlight
:
hierarchy
.
dendrogram
(
Z
,
labels
=
labs
,
orientation
=
'left'
)
elif
nodesRGB
:
rgb
=
[[]
for
_
in
range
(
3
)]
for
i
in
range
(
len
(
nodesRGB
)):
rgb
[
i
]
=
nodesRGB
[
i
]
link_cols
=
self
.
getRGB
(
N
,
rgb
)
hierarchy
.
dendrogram
(
Z
,
labels
=
labs
,
orientation
=
'left'
,
link_color_func
=
lambda
row
:
link_cols
[
row
])
else
:
# highlight
highlight
=
[(
node
,
-
math
.
log
(
p
))
for
(
node
,
p
)
in
nodesHighlight
]
link_cols
=
self
.
getHighlight
(
N
,
highlight
)
hierarchy
.
dendrogram
(
Z
,
labels
=
labs
,
orientation
=
'left'
,
link_color_func
=
lambda
row
:
link_cols
[
row
])
def
indexLabel
(
self
,
label
):
if
self
.
Labels
==
None
:
self
.
Labels
=
[]
for
i
in
range
(
len
(
self
.
Labels
)):
if
self
.
Labels
[
i
]
==
label
:
return
i
self
.
Labels
.
append
(
label
)
return
len
(
self
.
Labels
)
-
1
"""
"""
def
eucdist
(
v1
,
v2
):
diff
=
0
for
i
in
range
(
len
(
v1
)):
diff
+=
(
v1
[
i
]
-
v2
[
i
])
**
2
return
math
.
sqrt
(
diff
)
def
cosdist
(
v1
,
v2
):
if
len
(
v1
)
!=
len
(
v2
):
return
None
sum0
=
0
sum1
=
0
sum2
=
0
for
i
in
range
(
len
(
v1
)):
sum0
+=
v1
[
i
]
*
v2
[
i
]
sum1
+=
v1
[
i
]
*
v1
[
i
]
sum2
+=
v2
[
i
]
*
v2
[
i
]
return
1
-
(
sum0
/
(
math
.
sqrt
(
sum1
*
sum2
)))
def
normalise
(
v
):
y
=
sum
(
v
)
return
[
vi
/
y
for
vi
in
v
]
def
euc
(
data
):
d
=
np
.
zeros
((
len
(
data
),
len
(
data
)))
for
i
in
range
(
len
(
data
)):
for
j
in
range
(
i
+
1
,
len
(
data
)):
d
[
i
,
j
]
=
eucdist
(
data
[
i
],
data
[
j
])
return
d
def
norm
(
data
):
ndata
=
[[]
for
_
in
data
]
for
i
in
range
(
len
(
data
)):
ndata
[
i
]
=
normalise
(
data
[
i
])
return
ndata
import
csv
if
__name__
==
'__main__0'
:
dgram
=
Dendrogram
()
dgram
.
loadData
(
'/Users/mikael/simhome/monocyte/test100x10.tsv'
,
headers
=
True
)
d
=
euc
(
dgram
.
Data
)
dgram
.
createBaseTree
(
d
,
dgram
.
Labels
)
#dgram.createView('focus slamf7', dgram.getNodesByMembers(['p3@SLAMF7', 'p2@NFKB2'], view = 'original'))
dgram
.
createView
(
'focus sorl1'
,
dgram
.
getRootByMembers
([
'p2@SORL1'
,
'p1@FAU'
])[
-
1
])
#dgram.createView('focus ddx6', dgram.getNodesByMembers(['p3@DDX6', 'p2@CARS']))
dgram
.
createView
(
'focus sorl1 dist'
,
mindist
=
0.04
)
plt
.
figure
(
num
=
None
,
figsize
=
(
5
,
15
),
dpi
=
200
,
facecolor
=
'w'
,
edgecolor
=
'k'
)
#dgram.draw('focus ddx6', mode = 'label')
dgram
.
draw
(
'focus sorl1 dist'
,
mode
=
'vec'
)
plt
.
show
()
if __name__ == '__main__1':  # NOTE: '1' suffix deliberately disables this driver
    # Demo: build a base tree from row-normalised data and save it to disk.
    dgram = Dendrogram()
    dgram.loadData('/Users/mikael/simhome/CAGE_iPSC/de_promoter.tsv', headers=False)
    d = euc(norm(dgram.Data))  # distances computed on normalised rows
    dgram.createBaseTree(d, dgram.Labels)
    dgram.saveBaseTree('/Users/mikael/simhome/CAGE_iPSC/de_promoter.dgram')
    #plt.figure(num=None, figsize=(5, 15), dpi=200, facecolor='w', edgecolor='k')
    #dgram.draw()
    #plt.show()
if __name__ == '__main__2':  # NOTE: '2' suffix deliberately disables this driver
    # Demo: reload a saved base tree, build enrichment-focused views and draw.
    dgram = Dendrogram()
    dgram.loadData('/Users/mikael/simhome/monocyte/2016_03_22de.tsv', headers=True)
    dgram.loadBaseTree('/Users/mikael/simhome/monocyte/2016_03_22de.dgram')
    dgram.normData()
    dgram.createView('mindist=0.02', mindist=0.02)
    dgram.createView('focus nfkb2', dgram.getRootByMembers(['p1@NFKB2', 'p2@NFKB2']), mindist=0.05)
    enriched = dgram.getNodesByEnrichment(['p2@NFKB2', 'p3@NFKB2', 'p1@PDE4B', 'p2@PDE4B', 'p4@PDE4B', 'p6@PDE4B'])
    print(enriched)
    # Truncate the view just below the most enriched node's distance.
    dgram.createView('enriched_truncated', mindist=enriched[0].dist - 0.05, terminate_nodes=[enriched[0]])
    dgram.setView('focus nfkb2')
    plt.figure(num=None, figsize=(8, 15), dpi=300, facecolor='w', edgecolor='k')
    dgram.draw('focus nfkb2', maxlabel=25, mode='label', thresholds=[0.05, 0.08, 0.12], nodesRGB=[enriched])
    plt.show()
if __name__ == '__main__':  # the active entry point of this module
    # Small worked example: seven 3-dimensional samples with labels,
    # clustered by Euclidean distance and drawn as a dendrogram.
    data = [[1.0, 1.0, 1.0], [2.0, 2.0, 1.9], [2.1, 2.1, 2.1], [3.1, 3.2, 3.3],
            [3.2, 3.4, 3.6], [3.5, 3.4, 3.3], [0.9, 0.9, 1.0]]
    labels = ['S1', 'S2a', 'S2b', 'S3a', 'S3b', 'S3c', 'S0']
    dgram = Dendrogram()
    dgram.createBaseTree(euc(data), labels)
    dgram.draw()
    plt.show()
if __name__ == '__main__4':  # NOTE: '4' suffix deliberately disables this driver
    # Demo: attach data/labels to a previously saved tree and draw a sub-view.
    data = [[1.0, 1.0, 1.0], [2.0, 2.0, 1.9], [2.1, 2.1, 2.1], [3.1, 3.2, 3.3],
            [3.2, 3.4, 3.6], [3.5, 3.4, 3.3], [0.9, 0.9, 1.0]]
    labels = ['S1', 'S2a', 'S2b', 'S3a', 'S3b', 'S3c', 'S0']
    dgram = Dendrogram()
    dgram.loadBaseTree('/Users/mikael/simhome/monocyte/small.dgram')
    dgram.setData(data, labels)
    myroot = dgram.getRootByMembers(['S0', 'S2a'])  # smallest subtree containing both labels
    print(myroot)
    dgram.createViewFromRoot('nroot', myroot)
    dgram.draw()
    plt.show()
heap.py
0 → 100644
View file @
bd07c60d
import
numpy
as
np
class LabelHeap:
    """
    Min and max heap: data structure for keeping a list of labels, sorted by a value associated with each.
    Based on max heap in Necaise, "Data structures and algorithms in Python" (Ch 13); fixed a bunch of bugs though...
    """

    def __init__(self, maxsize, reverse=False):
        """ Initialise a heap.
        :param maxsize: the maximum size of the heap
        :param reverse: heap in descending order if true, else ascending
        """
        self.reverse = reverse
        self._elements = np.array([None for _ in range(maxsize)])  # labels as a binary tree in level order
        self._idx2val = dict()  # maps each live label to its associated value
        self._count = 0  # number of live entries in _elements

    def __len__(self):
        """
        The number of elements in the heap currently.
        :return: the number of added elements
        """
        return self._count

    def __str__(self):
        """
        String representation of heap. A list of labels in a binary tree (first element is the smallest/greatest value)
        :return: heap as a string
        """
        return str([y for y in self._elements[:self._count]])

    def __repr__(self):
        return self.__str__()

    def capacity(self):
        """
        Maximum size allocated to heap
        :return: the number of elements that this heap can store
        """
        return len(self._elements)

    def __getitem__(self, i):
        """
        Retrieve the value by tree index (index 0 is the root and contains the smallest/greatest value)
        :param i: index in tree
        :return: the value at this index
        """
        return self._idx2val[self._elements[i]]

    def add(self, label, value):
        """
        Add a label with value to heap
        :param label: a unique, hashable label
        :param value: the value used for ordering
        """
        assert self._count < self.capacity(), "Cannot add to a full heap"
        assert not label in self._idx2val, "Cannot add a duplicate label"
        self._elements[self._count] = label
        self._idx2val[label] = value
        self._count += 1
        self._siftUp(self._count - 1)

    def pop(self):
        """
        Pop the (label, value) pair with minimum/maximum value; removes the entry
        :return: tuple with label and value
        """
        assert self._count > 0, "Cannot extract from an empty heap"
        label = self._elements[0]
        # FIX: remove the label->value mapping as well; previously the mapping was
        # kept forever, leaking memory and making it impossible to re-add a popped
        # label (the duplicate assert in add() would fire).
        value = self._idx2val.pop(label)
        self._count -= 1
        self._elements[0] = self._elements[self._count]  # move last element to the root
        self._siftDown(0)
        return (label, value)

    def peek(self):
        """
        Peek the (label, value) pair with minimum/maximum value; does not change the heap
        :return: tuple with label and value
        """
        assert self._count > 0, "Cannot peek in an empty heap"
        return (self._elements[0], self._idx2val[self._elements[0]])

    def _delete(self, i):
        """
        Delete by internal, binary tree index
        :param i: index
        :return:
        """
        assert self._count > i, "Cannot delete index" + str(i)
        del self._idx2val[self._elements[i]]  # FIX: drop the mapping of the removed label too
        self._count -= 1
        self._elements[i] = self._elements[self._count]
        if i < self._count:  # only restore order if a live element was relocated into slot i
            # FIX: the relocated element may violate the heap property in either
            # direction, so try both; at most one of the calls can actually move it.
            self._siftDown(i)
            self._siftUp(i)

    def _siftUp(self, i):
        """ Move the element at tree index i towards the root until the heap property holds. """
        if i > 0:
            parent = (i - 1) // 2
            if (self[i] > self[parent] if self.reverse else self[i] < self[parent]):
                # swap with parent and continue from there
                tmp = self._elements[i]
                self._elements[i] = self._elements[parent]
                self._elements[parent] = tmp
                self._siftUp(parent)

    def _siftDown(self, i):
        """ Move the element at tree index i towards the leaves until the heap property holds. """
        left = 2 * i + 1
        right = 2 * i + 2
        extremist = i  # index of the smallest (or, if reverse, greatest) of i and its children
        if left < self._count and (self[left] >= self[extremist] if self.reverse else self[left] <= self[extremist]):
            extremist = left
        if right < self._count and (self[right] >= self[extremist] if self.reverse else self[right] <= self[extremist]):
            extremist = right
        if extremist != i:
            # swap with the chosen child and continue from there
            tmp = self._elements[i]
            self._elements[i] = self._elements[extremist]
            self._elements[extremist] = tmp
            self._siftDown(extremist)
\ No newline at end of file
ival.py
View file @
bd07c60d
...
...
@@ -97,7 +97,7 @@ class IntervalTree:
def
putAll
(
self
,
tree
):
for
i
in
tree
:
self
.
put
(
i
.
getInterval
(),
tree
.
get
(
i
.
getInterval
()))
self
.
put
(
i
.
getInterval
(),
tree
.
__getitem__
(
i
.
getInterval
()))
def
_randomizedInsert
(
self
,
node
,
ival
,
value
):
if
node
==
None
:
return
IntervalNode
(
ival
,
value
)
...
...
phylo.py
View file @
bd07c60d
...
...
@@ -150,6 +150,8 @@ class PhyloNode:
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
_verbose
=
True
def
__init__
(
self
,
parent
=
None
,
label
=
''
):
""" Initialise a node.
Set its parent (another PhyloNode), parent can be None.
...
...
@@ -183,7 +185,8 @@ class PhyloNode:
for
i
in
range
(
self
.
nChildren
()):
stubs
[
i
]
=
str
(
self
.
children
[
i
])
if
self
.
dist
or
self
.
dist
==
0.0
:
dist
=
':'
+
str
(
self
.
dist
)
if
self
.
dist
==
0.0
:
dist
=
''
else
:
dist
=
':'
+
'
%5.3
f'
%
self
.
dist
if
self
.
label
!=
None
:
label
=
str
(
self
.
label
)
if
self
.
nChildren
()
==
0
:
...
...
@@ -277,6 +280,7 @@ class PhyloNode:
else
:
self
.
seqscores
=
[[
0
if
a
==
sym
else
999999
for
a
in
aln
.
alphabet
]
for
sym
in
self
.
sequence
]
# if we want to weight scores, this would need to change
if
self
.
_verbose
:
print
(
'Forward:'
,
self
.
label
,
'
\n\t
'
,
self
.
seqscores
)
return
self
.
seqscores
def
_backwardParsimony
(
self
,
aln
,
seq
=
None
):
...
...
@@ -314,6 +318,7 @@ class PhyloNode:
col
+=
1
for
i
in
range
(
self
.
nChildren
()):
self
.
children
[
i
]
.
_backwardParsimony
(
aln
,
sequence
.
Sequence
(
childbuf
[
i
],
aln
.
alphabet
,
self
.
children
[
i
]
.
label
or
"Child of "
+
self
.
label
,
gappy
=
True
))
if
self
.
_verbose
:
print
(
'Backward:'
,
self
.
label
,
'
\n\t
'
,
self
.
backptr
)
return
self
.
sequence
def
getSequence
(
self
):
...
...
@@ -394,7 +399,6 @@ class PhyloNode:
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def
runUPGMA
(
aln
,
measure
,
absoluteDistances
=
False
):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
...
...
@@ -403,6 +407,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D
=
{}
N
=
{}
# The number of sequences in each node
M
=
aln
.
calcDistances
(
measure
)
# determine all pairwise distances
print
(
M
)
nodes
=
[
PhyloNode
(
label
=
seq
.
name
)
for
seq
in
aln
.
seqs
]
# construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for
i
in
range
(
len
(
nodes
)):
...
...
@@ -411,16 +416,17 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N
[
nodes
[
i
]]
=
1
# each cluster contains a single sequence
for
j
in
range
(
0
,
i
):
D
[
frozenset
([
nodes
[
i
],
nodes
[
j
]])]
=
M
[
i
,
j
]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
""" Treat each node as a cluster, until there is only one cluster left, find the *closest*
pair of clusters, and merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
gn
le below
while
len
(
N
)
>
1
:
# N will contain all "live" clusters, to be reduced to a si
ng
le below
closest_pair
=
(
None
,
None
)
# The two nodes that are closest to one another according to supplied metric
closest_dist
=
None
# The distance between them
print
(
len
(
N
),
'nodes remain'
)
for
pair
in
D
:
# check all pairs which should be merged
dist
=
D
[
pair
]
pair_as_list
=
list
(
pair
)
print
(
'Inspecting
\"
'
+
str
(
pair_as_list
[
0
])
+
'
\"
and
\"
'
+
str
(
pair_as_list
[
1
])
+
'
\"
at distance
%5.3
f'
%
D
[
pair
])
if
closest_dist
==
None
or
dist
<
closest_dist
:
closest_dist
=
dist
closest_pair
=
list
(
pair
)
...
...
@@ -428,21 +434,23 @@ def runUPGMA(aln, measure, absoluteDistances=False):
x
=
closest_pair
[
0
]
# See Zvelebil and Baum p. 278 for notation
y
=
closest_pair
[
1
]
z
=
PhyloNode
()
# create a new node for the cluster z
z
.
dist
=
D
.
pop
(
frozenset
([
x
,
y
]))
/
2.0
# assign the absolute distance,
travelled so far, note: this will
change to relative distance later
z
.
dist
=
D
.
pop
(
frozenset
([
x
,
y
]))
/
2.0
# assign the absolute distance, change to relative distance later
Nx
=
N
.
pop
(
x
)
# find number of sequences in x, remove the cluster from list N
Ny
=
N
.
pop
(
y
)
# find number of sequences in y, remove the cluster from list N
dz
=
{}
# new distances to cluster z
x
.
parent
=
z
y
.
parent
=
z
z
.
children
=
[
x
,
y
]
print
(
'Closest pair is
\"
'
+
str
(
x
)
+
'
\"
('
+
str
(
Nx
)
+
') and
\"
'
+
str
(
y
)
+
'
\"
('
+
str
(
Ny
)
+
') at distance
%5.3
f'
%
(
z
.
dist
*
2
),
'form new node '
+
str
(
z
))
for
w
in
N
:
# for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw
=
D
.
pop
(
frozenset
([
x
,
w
]))
# retrieve and remove distance from D: x to w
dyw
=
D
.
pop
(
frozenset
([
y
,
w
]))
# retrieve and remove distance from D: y to w
dz
[
w
]
=
(
Nx
*
dxw
+
Ny
*
dyw
)
/
(
Nx
+
Ny
)
# distance: z to w
print
(
str
(
z
)
+
' gets distance to
\"
'
+
str
(
w
)
+
'
\"
: ('
,
Nx
,
'*
%5.3
f'
%
dxw
,
'+'
,
Ny
,
'*
%5.3
f'
%
dyw
,
') / ('
,
Nx
,
'+'
,
Ny
,
') =
%5.3
f'
%
dz
[
w
])
N
[
z
]
=
Nx
+
Ny
# total number of sequences in new cluster, insert new cluster in list N
for
w
in
dz
:
# we have to run through the nodes again, now not including the removed x and y
D
[
frozenset
([
z
,
w
])]
=
dz
[
w
]
# for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
x
.
parent
=
z
y
.
parent
=
z
z
.
children
=
[
x
,
y
]
nodes
.
append
(
z
)
if
not
absoluteDistances
:
x
.
_propagateDistance
(
z
.
dist
)
# convert absolute distances to relative by recursing down left path
...
...
@@ -595,7 +603,17 @@ if __name__ == '__main__1':
print
(
tree
)
if
__name__
==
'__main__'
:
tree
=
readNewick
(
'/Users/mikael/simhome/ASR/parsitest.nwk'
)
tree
.
putAlignment
(
sequence
.
Alignment
(
sequence
.
readFastaFile
(
'/Users/mikael/simhome/ASR/parsitest.aln'
,
sequence
.
DNA_Alphabet
)))
aln
=
sequence
.
readFastaFile
(
'/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.aln'
,
sequence
.
Protein_Alphabet
)
tree
=
runUPGMA
(
sequence
.
Alignment
(
aln
),
"fractional"
)
writeNewickFile
(
'/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.nwk'
,
tree
)
if
__name__
==
'__main__3'
:
aln
=
sequence
.
readClustalFile
(
'/Users/mikael/simhome/ASR/dp16_example.aln'
,
sequence
.
Protein_Alphabet
)
tree
=
runUPGMA
(
aln
,
"poisson"
)
writeNewickFile
(
'/Users/mikael/simhome/ASR/dp16_example_UPGMA.nwk'
,
tree
)
if
__name__
==
'__main__4'
:
tree
=
readNewick
(
'/Users/mikael/simhome/ASR/parsitest2.nwk'
)
tree
.
putAlignment
(
sequence
.
Alignment
(
sequence
.
readFastaFile
(
'/Users/mikael/simhome/ASR/parsitest2.aln'
,
sequence
.
DNA_Alphabet
)))
tree
.
parsimony
()
print
(
tree
.
strSequences
())
\ No newline at end of file
sequence.py
View file @
bd07c60d
...
...
@@ -269,12 +269,13 @@ def parseDefline(string):
"""
if
len
(
string
)
==
0
:
return
(
''
,
''
,
''
,
''
)
s
=
string
.
split
()[
0
]
if
re
.
match
(
"^sp
\
|[A-Z][A-Z0-9]
{5}
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
if
re
.
match
(
"^sp
\
|[A-Z][A-Z0-9]
*
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
elif
re
.
match
(
"^tr
\
|[A-Z][A-Z0-9]*
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
elif
re
.
match
(
"^gi
\
|[0-9]*
\
|
\
S+
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
3
],
arg
[
0
],
arg
[
2
])
elif
re
.
match
(
"gb
\
|
\
S+
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
elif
re
.
match
(
"emb
\
|
\
S+
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
elif
re
.
match
(
"^refseq
\
|
\
S+
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
elif
re
.
match
(
"[A-Z][A-Z0-9]*
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
0
],
arg
[
1
],
'UniProt'
,
''
)
# assume this is UniProt
else
:
return
(
s
,
''
,
''
,
''
)
def
readFastaFile
(
filename
,
alphabet
=
None
,
ignore
=
False
,
gappy
=
False
,
parse_defline
=
True
):
...
...
@@ -849,7 +850,7 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
# that ends at sequence indices i and j, for A and B, resp.)
for
i
in
range
(
1
,
lenA
+
1
):
for
j
in
range
(
1
,
lenB
+
1
):
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
get
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
fromTop
=
S
[
i
-
1
,
j
]
+
gap
fromLeft
=
S
[
i
,
j
-
1
]
+
gap
S
[
i
,
j
]
=
max
([
match
,
fromTop
,
fromLeft
])
...
...
@@ -908,7 +909,7 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
# that ends at sequence indices i and j, for A and B, resp.)
for
i
in
range
(
1
,
lenA
+
1
):
for
j
in
range
(
1
,
lenB
+
1
):
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
get
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
match
=
S
[
i
-
1
,
j
-
1
]
+
substMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
fromTop
=
S
[
i
-
1
,
j
]
+
gap
fromLeft
=
S
[
i
,
j
-
1
]
+
gap
S
[
i
,
j
]
=
max
([
match
,
fromTop
,
fromLeft
,
0
])
# Local: add option that we re-start alignment from "0"
...
...
@@ -967,12 +968,12 @@ def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
for
j
in
range
(
1
,
lenB
+
1
):
for
k
in
range
(
1
,
lenC
+
1
):
# Scored using sum-of-pairs
matchABC
=
S
[
i
-
1
,
j
-
1
,
k
-
1
]
+
subsMatrix
.
get
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
\
+
subsMatrix
.
get
(
seqA
[
i
-
1
],
seqC
[
k
-
1
])
\
+
subsMatrix
.
get
(
seqB
[
j
-
1
],
seqC
[
k
-
1
])
matchAB
=
S
[
i
-
1
,
j
-
1
,
k
]
+
2
*
gap
+
subsMatrix
.
get
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
matchBC
=
S
[
i
,
j
-
1
,
k
-
1
]
+
2
*
gap
+
subsMatrix
.
get
(
seqB
[
j
-
1
],
seqC
[
k
-
1
])
matchAC
=
S
[
i
-
1
,
j
,
k
-
1
]
+
2
*
gap
+
subsMatrix
.
get
(
seqA
[
i
-
1
],
seqC
[
k
-
1
])
matchABC
=
S
[
i
-
1
,
j
-
1
,
k
-
1
]
+
subsMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
\
+
subsMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqC
[
k
-
1
])
\
+
subsMatrix
.
__getitem__
(
seqB
[
j
-
1
],
seqC
[
k
-
1
])
matchAB
=
S
[
i
-
1
,
j
-
1
,
k
]
+
2
*
gap
+
subsMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqB
[
j
-
1
])
matchBC
=
S
[
i
,
j
-
1
,
k
-
1
]
+
2
*
gap
+
subsMatrix
.
__getitem__
(
seqB
[
j
-
1
],
seqC
[
k
-
1
])
matchAC
=
S
[
i
-
1
,
j
,
k
-
1
]
+
2
*
gap
+
subsMatrix
.
__getitem__
(
seqA
[
i
-
1
],
seqC
[
k
-
1
])
gapAB
=
S
[
i
,
j
,
k
-
1
]
+
3
*
gap
gapBC
=
S
[
i
-
1
,
j
,
k
]
+
3
*
gap
gapAC
=
S
[
i
,
j
-
1
,
k
]
+
3
*
gap
...
...
sstruct.py
View file @
bd07c60d
...
...
@@ -26,8 +26,6 @@ cf_dict = { # Chou-Fasman table
'T'
:
(
83
,
119
,
96
,
0.086
,
0.108
,
0.065
,
0.079
),
# Threonine
'W'
:
(
108
,
137
,
96
,
0.077
,
0.013
,
0.064
,
0.167
),
# Tryptophan
'Y'
:
(
69
,
147
,
114
,
0.082
,
0.065
,
0.114
,
0.125
),
# Tyrosine
'V'
:
(
106
,
170
,
50
,
0.062
,
0.048
,
0.028
,
0.053
),
# Valine
'Y'
:
(
69
,
147
,
114
,
0.082
,
0.065
,
0.114
,
0.125
),
# Tyrosine
'V'
:
(
106
,
170
,
50
,
0.062
,
0.048
,
0.028
,
0.053
),}
# Valine
prot_alpha
=
sym
.
Protein_Alphabet
...
...
test_hca.py
0 → 100644
View file @
bd07c60d
import
unittest
from
hca
import
*
import
random
class MyTestCase(unittest.TestCase):
    """Tests for hca's PairArray symmetric indexing and DNode tree construction,
    Newick serialisation and re-parsing."""

    N = 8  # number of items; pairs are all (i, j) with 0 <= i < j < N

    def setUp(self):
        """ Set up for each test """
        # pairidxs1 indexes pairs in (i, j) order with i < j ...
        self.pairidxs1 = dict()
        y = 0
        for i in range(self.N):
            for j in range(i + 1, self.N):
                self.pairidxs1[(i, j)] = y
                y += 1
        # ... and pairidxs2 holds the same values keyed in flipped (j, i) order.
        self.pairidxs2 = dict()
        for i in range(self.N):
            for j in range(0, i):
                self.pairidxs2[(i, j)] = self.pairidxs1[(j, i)]

    def test_PairArray1(self):
        """A PairArray must treat (i, j) and (j, i) as the same cell."""
        pa1 = PairArray(self.N)
        pa2 = PairArray(self.N)
        for p in self.pairidxs1:
            pa1[p] = self.pairidxs1[p]
        for p in self.pairidxs2:
            pa2[p] = self.pairidxs2[p]
        for (i, j) in self.pairidxs1:
            # FIX: use assertEqual; the assertEquals alias is deprecated and removed in Python 3.12
            self.assertEqual(pa1[(j, i)], self.pairidxs1[(i, j)])
        for (i, j) in self.pairidxs2:
            self.assertEqual(pa2[(j, i)], pa1[(j, i)])

    def test_DNode1(self):
        """Child counts and leaf counts of a two-layer tree."""
        layer0 = [DNode(i) for i in range(0, 10)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist=random.randint(1, 10)))
        root = DNode(len(layer0) + len(layer1), layer1, dist=100)
        self.assertEqual(root.nChildren(), len(layer1))
        self.assertEqual(len(root.getLeaves()), len(layer0))
        for i in range(len(layer1)):
            self.assertEqual(layer1[i].nChildren(), 2)
        for i in range(len(layer0)):
            self.assertEqual(layer0[i].nChildren(), 0)

    def test_DNode2(self):
        """str() of a tree can be parsed back to an equivalent tree (round-trip)."""
        layer0 = [DNode(i) for i in range(0, 10)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist=random.randint(1, 10)))
        root1 = DNode(len(layer0) + len(layer1), layer1, dist=100)
        s1 = str(root1)
        root2 = parse(s1)
        self.assertEqual(root2.nChildren(), root1.nChildren())
        self.assertEqual(len(root2.getLeaves()), len(root1.getLeaves()))
        s2 = str(root2)
        root3 = parse(s2)
        self.assertEqual(str(root3), s2)

    def test_DNode3(self):
        """Newick output with list labels and dict labels must agree, and
        parsed trees must locate the same nodes either way."""
        layer0 = [DNode(i) for i in range(0, 8)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist=random.randint(1, 10)))
        layer2 = []
        for i in range(0, len(layer1) // 2):
            layer2.append(DNode(i + len(layer0) + len(layer1), children=[layer1[i * 2], layer1[i * 2 + 1]], dist=random.randint(11, 20)))
        root = DNode(len(layer0) + len(layer1) + len(layer2), layer2, dist=30)
        chars = 'ABCDEFGHIJKLMNOP'
        labels_list = [ch for ch in chars]
        root1 = parse(root.newick(labels_list))
        labels_rev = [ch for ch in chars[::-1]]
        # FIX: removed unused local labels_off1 (was never referenced)
        labels_dict = {}
        for i in range(len(labels_list)):
            labels_dict[i] = labels_list[i]
        root2 = parse(root.newick(labels_dict))
        self.assertEqual(len(parse(root.newick(labels_rev)).getLeaves()), len(root.getLeaves()))
        self.assertEqual(root.newick(labels_dict), root.newick(labels_list))
        for ch in chars[:-1]:  # all chars except last one
            node1 = root1.findNode(ch)
            node2 = root2.findNode(ch)
            self.assertIsNotNone(node1)
            self.assertIsNotNone(node2)
            self.assertEqual(len(node1.getLeaves()), len(node2.getLeaves()))
            self.assertEqual(str(root1.findNode(ch)), str(root2.findNode(ch)))

    def test_DNode4(self):
        pass
if __name__ == '__main__':
    # Run the full test suite when this file is executed as a script.
    unittest.main()
test_heap.py
0 → 100644
View file @
bd07c60d
import
unittest
from
heap
import
*
import
random
class MyTestCase(unittest.TestCase):
    """Tests for heap.LabelHeap in both min (default) and max (reverse) modes."""

    def setUp(self):
        """ Set up for each test: a random-sized fixture of (label, value) pairs
        loaded into one min heap and one max heap. """
        # FIX: upper bound of the start was 10, so range(10, 10) could be empty,
        # making test_MinHeap2 pop from an empty heap and fail intermittently;
        # capping the start at 9 guarantees at least one element.
        idxs = [i for i in range(random.randint(0, 9), random.randint(10, 50))]
        random.shuffle(idxs)
        self.a = [(idx, random.random()) for idx in idxs]
        self.mh = LabelHeap(len(self.a))  # min heap
        self.maxh = LabelHeap(len(self.a), reverse=True)  # max heap
        for (address, value) in self.a:
            self.mh.add(address, value)
            self.maxh.add(address, value)

    def test_MinHeap1(self):
        """The heap reports the number of added elements."""
        # FIX: use assertEqual; the assertEquals alias is deprecated and removed in Python 3.12
        self.assertEqual(len(self.mh), len(self.a))

    def test_MinHeap2(self):
        """pop() returns the (label, value) pair with the smallest value."""
        minidx = 0
        for i in range(1, len(self.a)):
            if self.a[i][1] < self.a[minidx][1]:
                minidx = i
        #print(self.mh._elements[0], self.mh[0])
        (address, value) = self.mh.pop()
        self.assertEqual(address, self.a[minidx][0])
        self.assertEqual(value, self.a[minidx][1])

    def test_MinHeap3(self):
        """Repeated pops from a min heap drain values in ascending order."""
        ys = [y[1] for y in self.a]
        ys.sort(reverse=False)
        for y in ys:
            self.assertEqual(y, self.mh[0])
            self.mh.pop()

    def test_MaxHeap3(self):
        """Repeated pops from a max heap drain values in descending order."""
        ys = [y[1] for y in self.a]
        ys.sort(reverse=True)
        for y in ys:
            self.assertEqual(y, self.maxh[0])
            self.maxh.pop()

    def test_MinHeap4(self):
        """Interleaved add/pop keeps len() and the min element consistent."""
        mh1 = LabelHeap(10)
        self.assertEqual(len(mh1), 0)
        mh1.add('a', 2)
        self.assertEqual(len(mh1), 1)
        mh1.add('b', 1)
        self.assertEqual(len(mh1), 2)
        (label, y) = mh1.pop()
        self.assertEqual(label, 'b')
        self.assertEqual(len(mh1), 1)
        mh1.add('c', 3)
        self.assertEqual(len(mh1), 2)
if __name__ == '__main__':
    # Run the full test suite when this file is executed as a script.
    unittest.main()
webservice.py
View file @
bd07c60d
...
...
@@ -176,7 +176,7 @@ def getGODef(goterm):
goterm: the identifier, e.g. 'GO:0002080'
"""
# first turn off server certificate verification
if
(
not
os
.
environ
.
get
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
if
(
not
os
.
environ
.
__getitem__
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
ssl
.
_create_default_https_context
=
ssl
.
_create_unverified_context
# Construct URL with query term
url
=
__ebiGOUrl__
+
'ontology/go/search?query='
+
goterm
...
...
@@ -225,7 +225,7 @@ def getGOTerms(genes):
# Construct URL
# Get the entry: fill in the fields specified below
# first turn off server certificate verification
if
(
not
os
.
environ
.
get
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
if
(
not
os
.
environ
.
__getitem__
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
ssl
.
_create_default_https_context
=
ssl
.
_create_unverified_context
page
=
1
try
:
...
...
@@ -234,7 +234,7 @@ def getGOTerms(genes):
urlreq
=
urllib
.
request
.
Request
(
url
)
urlreq
.
add_header
(
'Accept-encoding'
,
'gzip'
)
response
=
urllib
.
request
.
urlopen
(
urlreq
)
if
response
.
info
()
.
get
(
'Content-Encoding'
)
==
'gzip'
:
if
response
.
info
()
.
__getitem__
(
'Content-Encoding'
)
==
'gzip'
:
buf
=
StringIO
(
response
.
read
())
f
=
gzip
.
GzipFile
(
fileobj
=
buf
)
data
=
f
.
read
()
.
decode
(
"utf-8"
)
...
...
@@ -285,7 +285,7 @@ def getGenes(goterms, taxo=None):
term
=
termbatch
[
i
]
uri_string
+=
term
+
","
if
i
<
len
(
termbatch
)
-
1
else
term
# first turn off server certificate verification
if
(
not
os
.
environ
.
get
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
if
(
not
os
.
environ
.
__getitem__
(
'PYTHONHTTPSVERIFY'
,
''
)
and
getattr
(
ssl
,
'_create_unverified_context'
,
None
)):
ssl
.
_create_default_https_context
=
ssl
.
_create_unverified_context
page
=
1
try
:
...
...
@@ -294,7 +294,7 @@ def getGenes(goterms, taxo=None):
urlreq
=
urllib
.
request
.
Request
(
url
)
urlreq
.
add_header
(
'Accept-encoding'
,
'gzip'
)
response
=
urllib
.
request
.
urlopen
(
urlreq
)
if
response
.
info
()
.
get
(
'Content-Encoding'
)
==
'gzip'
:
if
response
.
info
()
.
__getitem__
(
'Content-Encoding'
)
==
'gzip'
:
buf
=
StringIO
(
response
.
read
())
f
=
gzip
.
GzipFile
(
fileobj
=
buf
)
data
=
f
.
read
()
.
decode
(
"utf-8"
)
...
...
@@ -534,7 +534,7 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
request
=
urllib
.
request
.
Request
(
url
,
data
)
opener
=
urllib
.
request
.
build_opener
()
response
=
opener
.
open
(
request
)
page
=
response
.
read
(
200000
)
.
decode
(
'utf-8'
)
page
=
response
.
read
(
200000
00
)
.
decode
(
'utf-8'
)
up_dict
=
{}
# For each record we retrieve, split the line by tabs and build up the UniProt dict
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment