Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
binfpy
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
opensource
binfpy
Commits
9889428a
Commit
9889428a
authored
Jan 07, 2019
by
Mikael Boden
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added_Poisson
parent
e1db8f84
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
163 additions
and
12 deletions
+163
-12
gtf.py
gtf.py
+21
-3
prob.py
prob.py
+129
-1
sequence.py
sequence.py
+13
-8
No files found.
gtf.py
View file @
9889428a
...
...
@@ -64,13 +64,13 @@ class GtfFile:
See http://genome.ucsc.edu/FAQ/FAQformat#format1
"""
def
__init__
(
self
,
entries
):
def
__init__
(
self
,
entries
,
filter_feature
=
None
):
"""
Create a GtfFile instance.
:param entries: an iterable of entries or a filename
"""
if
isinstance
(
entries
,
str
):
# filename
self
.
chroms
=
readGtfFile
(
entries
)
self
.
chroms
=
readGtfFile
(
entries
,
filter_feature
)
else
:
self
.
chroms
=
dict
()
for
entry
in
entries
:
...
...
@@ -157,8 +157,24 @@ class GtfFile:
elif
len
(
all
)
==
0
:
return
None
else
:
return
next
(
iter
(
all
))
def
readGtfFile
(
filename
):
def
getIndex
(
self
,
group_attribute
):
"""
Create a dictionary for a specified attribute in the group list, e.g. "gene_id", "gene_name" or "transcript_id"
:param group_attribute:
:return: a dictionary keyed by the values of the nominated group_attribute,
pointing to the entry in the chromosome interval-tree
"""
d
=
{}
for
entry
in
self
:
if
group_attribute
in
entry
.
attr
:
# is the entry indexed with the attribute
if
not
entry
.
attr
[
group_attribute
]
in
d
:
# only add the first entry for this particular value
d
[
entry
.
attr
[
group_attribute
]]
=
entry
return
d
def
readGtfFile
(
filename
,
filter_feature
=
None
):
""" Read a GTF/GFF file.
filename: name of file
filter_feature: name of feature to be selected, all others ignored; None means anything
"""
f
=
open
(
filename
)
row
=
0
...
...
@@ -180,6 +196,8 @@ def readGtfFile(filename):
seqname
=
words
[
0
]
source
=
words
[
1
]
feature
=
words
[
2
]
if
filter_feature
and
filter_feature
!=
feature
:
continue
start
=
int
(
words
[
3
])
end
=
int
(
words
[
4
])
score
=
None
...
...
prob.py
View file @
9889428a
...
...
@@ -5,7 +5,6 @@ Uses and depends on "Alphabet" that is used to define discrete random variables.
'''
import
random
from
sym
import
*
from
copy
import
deepcopy
import
math
#################################################################################################
...
...
@@ -957,3 +956,132 @@ class HMM():
fh
.
write
(
html
)
fh
.
close
()
return
html
#################################################################################################
# The univariate Gaussian density function.
#################################################################################################
class
Gaussian
():
mu
=
None
# mean
sigma
=
None
# standard deviation
sigmaSquared
=
None
# variance
def
ROOT_2PI
(
self
):
return
(
2
*
math
.
pi
)
**
2
def
LOG_ROOT_2PI
(
self
):
return
0.5
*
(
math
.
log
(
2
)
+
math
.
log
(
math
.
pi
))
def
__init__
(
self
,
mean
,
variance
):
""" Creates a univariate Gaussian distribution with the given fixed mean and variance. """
self
.
mu
=
mean
self
.
sigmaSquared
=
variance
;
self
.
sigma
=
math
.
sqrt
(
variance
);
self
.
normConst
=
self
.
sigma
*
self
.
ROOT_2PI
();
self
.
logNormConst
=
(
0.5
*
math
.
log
(
variance
))
+
self
.
LOG_ROOT_2PI
();
def
__str__
(
self
):
return
"<"
+
"
%5.3
f"
%
self
.
mu
+
u"
\u00B1
%5.3
f"
%
self
.
sigma
+
">"
def
getDensity
(
self
,
x
):
""" Returns the density of this Gaussian distribution at the given value. """
return
(
math
.
exp
(
-
math
.
pow
((
x
-
self
.
mu
),
2
)
/
(
2
*
self
.
sigmaSquared
))
/
self
.
normConst
);
def
__getitem__
(
self
,
value
):
""" Get the probability density of a specified value for this Gaussian """
return
self
.
getDensity
(
value
)
def
getMean
(
self
):
return
self
.
mu
def
getVariance
(
self
):
return
self
.
sigmaSquared
def
getLogDensity
(
self
,
x
):
""" Returns the natural log of the density of this Gaussian distribution at the given value. """
return
(
-
math
.
pow
((
x
-
self
.
mu
),
2
)
/
(
2
*
self
.
sigmaSquared
))
-
self
.
logNormConst
;
def
sample
(
self
):
""" Returns a value sampled from this Gaussian distribution. The implementation uses the Box - Muller transformation
[G.E.P.Box and M.E.Muller(1958) "A note on the generation of random normal deviates". Ann.Math.Stat 29: 610 - 611]. """
U
=
random
.
random
()
# get a random value between 0 and 1
V
=
random
.
random
()
# get a random value between 0 and 1
return
(
self
.
mu
+
(
self
.
sigma
*
math
.
sin
(
2
*
math
.
pi
*
V
)
*
math
.
sqrt
((
-
2
*
math
.
log
(
U
)))))
def
estimate
(
samples
,
count
=
None
):
""" Create a density based on the specified samples. Optionally, provide an iterable with the corresponding counts/weights. """
mean
=
0
if
count
==
None
:
for
i
in
range
(
len
(
samples
)):
mean
+=
samples
[
i
]
/
len
(
samples
)
diff
=
0
for
i
in
range
(
len
(
samples
)):
diff
+=
(
mean
-
samples
[
i
])
*
(
mean
-
samples
[
i
]);
if
(
diff
==
0
):
return
None
return
Gaussian
(
mean
,
diff
/
len
(
samples
))
elif
len
(
count
)
==
len
(
samples
):
totcnt
=
0
for
i
in
range
(
len
(
samples
)):
mean
+=
samples
[
i
]
*
count
[
i
]
totcnt
+=
count
[
i
]
mean
/=
totcnt
diff
=
0
for
i
in
range
(
len
(
samples
)):
diff
+=
(
mean
-
samples
[
i
])
*
(
mean
-
samples
[
i
])
*
count
[
i
];
if
(
diff
==
0
):
return
None
return
Gaussian
(
mean
,
diff
/
totcnt
)
#################################################################################################
# The univariate Poisson distribution
#################################################################################################
class
Poisson
():
def
__init__
(
self
,
LAMBDA
):
"""
* Define a Poisson distribution
* @ param lambda the average number of events per interval
"""
self
.
LAMBDA
=
LAMBDA
def
p
(
self
,
k
):
"""
* The probability mass function
* @param k the number of events in the interval
* @return the probability of k events
"""
return
math
.
exp
(
k
*
math
.
log
(
self
.
LAMBDA
)
-
self
.
LAMBDA
-
lgamma
(
k
+
1
))
def
cdf
(
self
,
k
):
"""
* The cumulative probability function.
* The implementation calls the PMF for all values i from 0 to floor(k)
* @param k the number of events in the interval
* @return the cumulative probability of k events
* https://en.wikipedia.org/wiki/Poisson_distribution
"""
sum
=
0
for
i
in
range
(
k
+
1
):
sum
+=
self
.
p
(
i
)
if
(
sum
>=
1.0
):
# happens only due to poor numerical precision
return
1.0
return
sum
def
lgamma
(
x
):
"""
* Returns an approximation of the log of the Gamma function of x. Laczos
* Approximation Reference: Numerical Recipes in C
* http://www.library.cornell.edu/nr/cbookcpdf.html
"""
cof
=
[
76.18009172947146
,
-
86.50532032941677
,
24.01409824083091
,
-
1.231739572450155
,
0.1208650973866179e-2
,
-
0.5395239384953e-5
]
y
=
x
tmp
=
x
+
5.5
tmp
-=
((
x
+
0.5
)
*
math
.
log
(
tmp
))
ser
=
1.000000000190015
for
j
in
range
(
len
(
cof
)):
y
+=
1
ser
+=
(
cof
[
j
]
/
y
)
return
(
-
tmp
+
math
.
log
(
2.5066282746310005
*
ser
/
x
))
sequence.py
View file @
9889428a
...
...
@@ -195,14 +195,15 @@ class Sequence(object):
Below are some useful methods for loading data from strings and files.
Recognize the FASTA format (nothing fancy).
"""
def
readFasta
(
string
,
alphabet
=
None
,
ignore
=
False
,
gappy
=
False
):
def
readFasta
(
string
,
alphabet
=
None
,
ignore
=
False
,
gappy
=
False
,
parse_defline
=
True
):
""" Read the given string as FASTA formatted data and return the list of
sequences contained within it.
If alphabet is specified, use it, if None (default) then guess it.
If ignore is False, errors cause the method to fail.
If ignore is True, errors will disregard sequence.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
if True gaps are accepted and included in the resulting sequences.
If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
seqlist
=
[]
# list of sequences contained in the string
seqname
=
None
# name of *current* sequence
seqinfo
=
None
...
...
@@ -222,8 +223,11 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False):
seqinfo
=
line
[
1
:]
.
split
()
# skip first char
if
len
(
seqinfo
)
>
0
:
try
:
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
if
parse_defline
:
parsed
=
parseDefline
(
seqinfo
[
0
])
seqname
=
parsed
[
0
]
else
:
seqname
=
seqinfo
[
0
]
seqinfo
=
line
[
1
:]
except
IndexError
as
errmsg
:
if
not
ignore
:
...
...
@@ -266,14 +270,15 @@ def parseDefline(string):
elif
re
.
match
(
"^refseq
\
|
\
S+
\
|
\
S+"
,
s
):
arg
=
s
.
split
(
'|'
);
return
(
arg
[
1
],
arg
[
2
],
arg
[
0
],
''
)
else
:
return
(
s
,
''
,
''
,
''
)
def
readFastaFile
(
filename
,
alphabet
=
None
,
ignore
=
False
,
gappy
=
False
):
def
readFastaFile
(
filename
,
alphabet
=
None
,
ignore
=
False
,
gappy
=
False
,
parse_defline
=
True
):
""" Read the given FASTA formatted file and return the list of sequences
contained within it. Note that if alphabet is NOT specified, it will take a
separate guess for each sequence.
If ignore is False, errors cause the method to fail.
If ignore is True, errors will disregard sequence.
If gappy is False (default), sequence cannot contain gaps,
if True gaps are accepted and included in the resulting sequences."""
if True gaps are accepted and included in the resulting sequences.
If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
fh
=
open
(
filename
)
seqlist
=
[]
batch
=
''
# a batch of rows including one or more complete FASTA entries
...
...
@@ -282,7 +287,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
row
=
row
.
strip
()
if
len
(
row
)
>
0
:
if
row
.
startswith
(
'>'
)
and
rowcnt
>
0
:
more
=
readFasta
(
batch
,
alphabet
,
ignore
,
gappy
)
more
=
readFasta
(
batch
,
alphabet
,
ignore
,
gappy
,
parse_defline
)
if
len
(
more
)
>
0
:
seqlist
.
extend
(
more
)
batch
=
''
...
...
@@ -290,7 +295,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
batch
+=
row
+
'
\n
'
rowcnt
+=
1
if
len
(
batch
)
>
0
:
more
=
readFasta
(
batch
,
alphabet
,
ignore
,
gappy
)
more
=
readFasta
(
batch
,
alphabet
,
ignore
,
gappy
,
parse_defline
)
if
len
(
more
)
>
0
:
seqlist
.
extend
(
more
)
fh
.
close
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment