added_Poisson

9889428a · Mikael Boden · e1db8f84 · 9889428a · 9889428a · 9889428a
Commit 9889428a authored Jan 07, 2019 by Mikael Boden
Hide whitespace changes
Inline Side-by-side

Showing with 163 additions and 12 deletions

gtf.py gtf.py +21 -3

prob.py prob.py +129 -1

sequence.py sequence.py +13 -8

No files found.
--- a/gtf.py
+++ b/gtf.py
@@ -64,13 +64,13 @@ class GtfFile:
        See http://genome.ucsc.edu/FAQ/FAQformat#format1
    """

-    def __init__(self, entries):
+    def __init__(self, entries, filter_feature = None):
        """
        Create a GtfFile instance.
        :param entries: an iterable of entries or a filename
        """
        if isinstance(entries, str): # filename
-            self.chroms = readGtfFile(entries)
+            self.chroms = readGtfFile(entries, filter_feature)
        else:
            self.chroms = dict()
            for entry in entries:
@@ -157,8 +157,24 @@ class GtfFile:
        elif len(all) == 0: return None
        else: return next(iter(all))

-def readGtfFile(filename):
+    def getIndex(self, group_attribute):
+        """
+        Create a dictionary for a specified attribute in the group list, e.g. "gene_id", "gene_name" or "transcript_id"
+        :param group_attribute:
+        :return: a dictionary keyed by the values of the nominated group_attribute,
+        pointing to the entry in the chromosome interval-tree
+        """
+        d = {}
+        for entry in self:
+            if group_attribute in entry.attr: # is the entry indexed with the attribute
+                if not entry.attr[group_attribute] in d: # only add the first entry for this particular value
+                    d[entry.attr[group_attribute]] = entry
+        return d
+
+def readGtfFile(filename, filter_feature = None):
    """ Read a GTF/GFF file.
+    filename: name of file
+    filter_feature: name of feature to be selected, all others ignored; None means anything
    """
    f = open(filename)
    row = 0
@@ -180,6 +196,8 @@ def readGtfFile(filename):
            seqname = words[0]
            source = words[1]
            feature = words[2]
+            if filter_feature and filter_feature != feature:
+                continue
            start = int(words[3])
            end = int(words[4])
            score = None

--- a/prob.py
+++ b/prob.py
@@ -5,7 +5,6 @@ Uses and depends on "Alphabet" that is used to define discrete random variables.
 '''
 import random
 from sym import *
-from copy import deepcopy
 import math

 #################################################################################################
@@ -957,3 +956,132 @@ class HMM():
            fh.write(html)
            fh.close()
        return html
+
+#################################################################################################
+# The univariate Gaussian density function.
+#################################################################################################
+
+class Gaussian():
+
+    mu = None           # mean
+    sigma = None        # standard deviation
+    sigmaSquared = None # variance
+
+    def ROOT_2PI(self):
+        return (2 * math.pi) ** 2
+    def LOG_ROOT_2PI(self):
+        return 0.5 * (math.log(2) + math.log(math.pi))
+
+    def __init__(self, mean, variance):
+        """ Creates a univariate Gaussian distribution with the given fixed mean and variance. """
+        self.mu = mean
+        self.sigmaSquared = variance;
+        self.sigma = math.sqrt(variance);
+        self.normConst = self.sigma * self.ROOT_2PI();
+        self.logNormConst = (0.5 * math.log(variance)) + self.LOG_ROOT_2PI();
+
+    def __str__(self):
+        return "<" + "%5.3f" % self.mu + u"\u00B1%5.3f" % self.sigma + ">"
+
+    def getDensity(self, x):
+        """ Returns the density of this Gaussian distribution at the given value. """
+        return (math.exp(-math.pow((x - self.mu), 2) / (2 * self.sigmaSquared)) / self.normConst);
+
+    def __getitem__(self, value):
+        """ Get the probability density of a specified value for this Gaussian """
+        return self.getDensity(value)
+
+    def getMean(self):
+        return self.mu
+
+    def getVariance(self):
+        return self.sigmaSquared
+
+    def getLogDensity(self, x):
+        """ Returns the natural log of the density of this Gaussian distribution at the given value. """
+        return (-math.pow((x - self.mu), 2) / (2 * self.sigmaSquared)) - self.logNormConst;
+
+    def sample(self):
+        """ Returns a value sampled from this Gaussian distribution. The implementation uses the Box - Muller transformation
+         [G.E.P.Box and M.E.Muller(1958) "A note on the generation of random normal deviates". Ann.Math.Stat 29: 610 - 611]. """
+        U = random.random() # get a random value between 0 and 1
+        V = random.random() # get a random value between 0 and 1
+        return (self.mu + (self.sigma * math.sin(2 * math.pi * V) * math.sqrt((-2 * math.log(U)))))
+
+def estimate(samples, count = None):
+    """ Create a density based on the specified samples. Optionally, provide an iterable with the corresponding counts/weights. """
+    mean = 0
+    if count == None:
+        for i in range(len(samples)):
+            mean += samples[i] / len(samples)
+        diff = 0
+        for i in range(len(samples)):
+            diff += (mean - samples[i]) * (mean - samples[i]);
+        if (diff == 0):
+            return None
+        return Gaussian(mean, diff / len(samples))
+    elif len(count) == len(samples):
+        totcnt = 0
+        for i in range(len(samples)):
+            mean += samples[i] * count[i]
+            totcnt += count[i]
+        mean /= totcnt
+        diff = 0
+        for i in range(len(samples)):
+            diff += (mean - samples[i]) * (mean - samples[i]) * count[i];
+        if (diff == 0):
+            return None
+        return Gaussian(mean, diff / totcnt)
+
+#################################################################################################
+# The univariate Poisson distribution
+#################################################################################################
+
+class Poisson():
+
+    def __init__(self, LAMBDA):
+        """
+           * Define a Poisson distribution
+           * @ param lambda the average number of events per interval
+        """
+        self.LAMBDA = LAMBDA
+
+    def p(self, k):
+        """
+         * The probability mass function
+         * @param k the number of events in the interval
+         * @return the probability of k events
+        """
+        return math.exp(k * math.log(self.LAMBDA) - self.LAMBDA - lgamma(k + 1))
+
+    def cdf(self, k):
+        """
+         * The cumulative probability function.
+         * The implementation calls the PMF for all values i from 0 to floor(k)
+         * @param k the number of events in the interval
+         * @return the cumulative probability of k events
+         * https://en.wikipedia.org/wiki/Poisson_distribution
+        """
+        sum = 0
+        for i in range(k + 1):
+            sum += self.p(i)
+            if (sum >= 1.0): # happens only due to poor numerical precision
+                return 1.0
+        return sum
+
+
+def lgamma(x):
+    """
+     * Returns an approximation of the log of the Gamma function of x. Laczos
+     * Approximation Reference: Numerical Recipes in C
+     * http://www.library.cornell.edu/nr/cbookcpdf.html
+    """
+    cof = [ 76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5 ]
+    y = x
+    tmp = x + 5.5
+    tmp -= ((x + 0.5) * math.log(tmp))
+    ser = 1.000000000190015
+    for j in range(len(cof)):
+        y += 1
+        ser += (cof[j] / y)
+    return (-tmp + math.log(2.5066282746310005 * ser / x))
--- a/sequence.py
+++ b/sequence.py
@@ -195,14 +195,15 @@ class Sequence(object):
 Below are some useful methods for loading data from strings and files.
 Recognize the FASTA format (nothing fancy).
 """
-def readFasta(string, alphabet = None, ignore = False, gappy = False):
+def readFasta(string, alphabet = None, ignore = False, gappy = False, parse_defline = True):
    """ Read the given string as FASTA formatted data and return the list of
        sequences contained within it.
        If alphabet is specified, use it, if None (default) then guess it.
        If ignore is False, errors cause the method to fail.
        If ignore is True, errors will disregard sequence.
        If gappy is False (default), sequence cannot contain gaps,
-        if True gaps are accepted and included in the resulting sequences."""
+        if True gaps are accepted and included in the resulting sequences.
+        If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
    seqlist = []    # list of sequences contained in the string
    seqname = None  # name of *current* sequence
    seqinfo = None
@@ -222,8 +223,11 @@ def readFasta(string, alphabet = None, ignore = False, gappy = False):
            seqinfo = line[1:].split() # skip first char
            if len(seqinfo) > 0:
                try:
-                    parsed = parseDefline(seqinfo[0])
-                    seqname = parsed[0]
+                    if parse_defline:
+                        parsed = parseDefline(seqinfo[0])
+                        seqname = parsed[0]
+                    else:
+                        seqname = seqinfo[0]
                    seqinfo = line[1:]
                except IndexError as errmsg:
                    if not ignore:
@@ -266,14 +270,15 @@ def parseDefline(string):
    elif re.match("^refseq\|\S+\|\S+", s):                   arg = s.split('|');  return (arg[1], arg[2], arg[0], '')
    else: return (s, '', '', '')

-def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
+def readFastaFile(filename, alphabet = None, ignore = False, gappy = False, parse_defline = True):
    """ Read the given FASTA formatted file and return the list of sequences
        contained within it. Note that if alphabet is NOT specified, it will take a
        separate guess for each sequence.
        If ignore is False, errors cause the method to fail.
        If ignore is True, errors will disregard sequence.
        If gappy is False (default), sequence cannot contain gaps,
-        if True gaps are accepted and included in the resulting sequences."""
+        if True gaps are accepted and included in the resulting sequences.
+        If parse_defline is False, the name will be set to everything before the first space, else parsing will be attempted."""
    fh = open(filename)
    seqlist = []
    batch = '' # a batch of rows including one or more complete FASTA entries
@@ -282,7 +287,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
        row = row.strip()
        if len(row) > 0:
            if row.startswith('>') and rowcnt > 0:
-                more = readFasta(batch, alphabet, ignore, gappy)
+                more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
                if len(more) > 0:
                    seqlist.extend(more)
                batch = ''
@@ -290,7 +295,7 @@ def readFastaFile(filename, alphabet = None, ignore = False, gappy = False):
            batch += row + '\n'
            rowcnt += 1
    if len(batch) > 0:
-        more = readFasta(batch, alphabet, ignore, gappy)
+        more = readFasta(batch, alphabet, ignore, gappy, parse_defline)
        if len(more) > 0:
            seqlist.extend(more)
    fh.close()