Commit 63302fa3 authored by Eric Davis's avatar Eric Davis

added more simulations, came up with a way to querry similar reads, and added…

added more simulations, came up with a way to querry similar reads, and added the bucket values which describe where in the read the kmer was found
parent 8af12ba9
......@@ -24,4 +24,4 @@ cdef class Hasher(object):
:param int x: The input value to hash.
:return int: The hash value.
"""
return (x + self._b) % self._c
\ No newline at end of file
return (x + self._b) % self._c
......@@ -11,8 +11,6 @@ __modname__ = 'hasher.py'
import numpy as np
import pyximport; pyximport.install()
from long_read_aligner.cython.chash import Hasher
class HashFactory(object):
......
......@@ -11,15 +11,52 @@ __modname__ = 'kmer.py'
import sys
import math
from collections import defaultdict
from abc import ABCMeta, abstractmethod, abstractproperty
class Read(object):
__slots__ = ['seq', '_sketch']
def __init__(self, seq):
self.seq = seq
self._sketch = None
def setSketch(self, sketch):
self._sketch = sketch
@property
def sketch(self):
return self._sketch
class Kmer(object):
__slots__ = ['seq', '_idx', 'integer_val']
__metaclass__ = ABCMeta
@abstractproperty
def integer_val(self):
pass
@abstractproperty
def idx(self):
pass
@abstractproperty
def __str__(self):
pass
class KmerSingle(Kmer):
__slots__ = ['seq', 'idx', 'integer_val']
def __init__(self, idx, seq):
self.seq = seq
self.integer_val = hash(seq)
self._idx = idx
self.idx = idx
def __str__(self):
return self.seq
......@@ -28,30 +65,47 @@ class Kmer(object):
return self.seq == other.seq
def __repr__(self):
return "<Kmer:{}, idx:{}>".format(self.seq, self._idx)
return "<Kmer:{}, idx:{}>".format(self.seq, self.idx)
def __index__(self):
return self._idx
return self.idx
def __gt__(self, other):
return self._idx > other
return self.idx > other
class KmerPair(Kmer):
__slots__ = ['kmer1', 'kmer2', 'distance', 'integer_val']
def __init__(self, kmer1, kmer2):
self.kmer1 = kmer1.seq
self.kmer2 = kmer2.seq
self.distance = kmer2.idx - kmer1.idx
self.integer_val = hash(self.kmer1 + self.kmer2 + str(self.distance))
@property
def idx(self):
return 'blah'
def __str__(self):
pass
class Window(object):
KMER = Kmer
KMER = KmerSingle
def __init__(self, seq, kmer_size):
def __init__(self, seq, kmer_size, n_buckets):
self.seq = seq
self.kmers = self.makeWords(seq, kmer_size)
self.kmer_pairs = None
self.buckets = getBuckets(n_buckets, len(seq))
self.kmers = self.makeWords(seq, kmer_size)
def __getitem__(self, key):
return self.kmers[key]
@classmethod
def makeWords(cls, w, k):
return [cls.KMER(i, w[i: i + k]) for i in xrange(len(w) - k + 1)]
def makeWords(self, w, k):
return [self.KMER(self.buckets[i], w[i: i + k]) for i in xrange(len(w) - k + 1)]
def __repr__(self):
return "<{}:n_kmers:{}>".format(self.__class__.__name__, len(self.kmers))
......@@ -62,7 +116,28 @@ class Window(object):
yield kmer
def makeKmerPairs(self, kmer):
kmer_idx = kmer._idx
return [Kmer(kmer_idx, ''.join([kmer.seq, next_kmer.seq, str(next_kmer._idx - kmer_idx)])) for next_kmer in self.getDownstreamKmers(kmer)]
return [KmerPair(kmer, next_kmer) for next_kmer in self.getDownstreamKmers(kmer)]
def getBuckets(n_buckets, len_seq):
items_per_bucket = math.ceil(float(len_seq) / float(n_buckets))
running_bucket = range(n_buckets)
current_bucket = running_bucket[0]
buckets = {}
bucket_count = 0
for i in range(len_seq):
if bucket_count < items_per_bucket:
buckets[i] = running_bucket[current_bucket]
bucket_count +=1
else:
current_bucket +=1
bucket_count = 0
buckets[i] = running_bucket[current_bucket]
return buckets
......@@ -12,11 +12,14 @@ __modname__ = "min_hash.py"
import sys
import numpy as np
import operator
from collections import defaultdict
from collections import defaultdict, Counter
from abc import ABCMeta, abstractmethod
from hasher import HashFactory, Hasher
from kmer import Kmer, Window
from hasher import HashFactory
from kmer import Kmer, Window, Read
import pyximport; pyximport.install()
from cython.chash import Hasher
class MinHash(object):
......@@ -31,28 +34,30 @@ class MinHash(object):
self._hash_permutations = HashFactory(Hasher, n_hash_functions).getHashes()
@staticmethod
def getMinimumHash(hashes):
return min(filter(lambda x: x != 0, hashes))
def getMinimumHash(hashes_kmer):
return min(hashes_kmer, key=operator.itemgetter(0))
def calcSignature(self, kmers):
def calcSketch(self, kmers):
"""
Calculates a minhash signature given a set of kmers.
Calculates a minhash Sketch given a set of kmers.
:param list(Kmer) kmers: a list of Kmer instances
:return list(int) signature: A list representing a minhash signature
:return list(int) Sketch: A list representing a minhash Sketch
"""
signature = []
sketch = []
for hasher in self._hash_permutations:
hashes = [hasher(x) for x in kmers]
hashes = [(hasher.hashIt(kmer.integer_val), kmer) for kmer in kmers]
min_hash = self.getMinimumHash(hashes)
signature.append(min_hash)
sketch.append(min_hash)
return signature
return sketch
def approximateJaccard(self, sig1, sig2):
"""
Calculate an approximate Jaccard index based on minhash signatures
Calculate an approximate Jaccard index based on minhash Sketchs
TODO: Assert that sig1 and sig2 were calculated with the same hash functions, otherwise
the approimation is invalid.
:param list(int) sig1:
......@@ -64,77 +69,99 @@ class MinHash(object):
return float(sum(sig1 == sig2)) / float(self._n_hash_functions)
class LoggingMinHash(MinHash):
"""
Class for calculating a min hash signature. Implements logging capabilities to back-lookup original input values
prior to hashing. This removes the need to calculate all kmer pairs and distances exhaustively, which is a
quadratic operation. Alot of book keeping going on here.
"""
class SketchDirector(object):
"""Flow control for generating minhash Sketchs for a given sequence"""
def __init__(self, n_hash_functions):
super(LoggingMinHash, self).__init__(n_hash_functions)
self.hash_to_kmer = {}
__metaclass__ = ABCMeta
def setHashToKmer(self, min_hash_to_int):
"""Updates the hash to int map of the instance"""
self.hash_to_kmer.update(min_hash_to_int)
def getKmer(self, min_hash):
def __init__(self, word_size, n_hashing_functions, hasher):
"""
Gets the kmer string given the minhash value.
Init method for class
:param int word_size: The kmer size to use
:param n_hashing_functions: The number of hashing functions to use.
"""
return self.hash_to_kmer[min_hash]
self.word_size = word_size
self._n_hashing_functions = n_hashing_functions
self._min_hash = hasher(n_hashing_functions)
self._Sketch_cache = {x:defaultdict(list) for x in xrange(n_hashing_functions)}
@staticmethod
def getMinimumHash(hashes_kmer):
return min(hashes_kmer, key=operator.itemgetter(0))
def run(self, seq):
Sketch = self.getSketch(seq)
self.storeSketch(seq, Sketch)
def calcSignature(self, kmers):
"""
Calculates a minhash signature given a set of kmers.
:param list(Kmer) kmers: A list of Kmers instances
"""
def storeSketch(self, seq, Sketch):
for i, min_mer in enumerate(Sketch):
self._Sketch_cache[i][min_mer].append(seq)
primary_signature = []
def querryRead(self, Sketch):
for i, min_mer in enumerate(Sketch):
yield self._Sketch_cache[i][min_mer]
for hasher in self._hash_permutations:
def getSimilarReads(self, read):
similar_reads = []
for q_read_list in self.querryRead(read.sketch):
for q_read in q_read_list:
if q_read != read.seq:
similar_reads.append(q_read)
return Counter(similar_reads)
class GreedyPairSketchDirector(SketchDirector):
hashes = [(hasher.hashIt(x.integer_val), x) for x in kmers if x.integer_val!=0]
min_hash = self.getMinimumHash(hashes)
primary_signature.append(min_hash[0])
self.hash_to_kmer[min_hash[0]] = min_hash[1]
return primary_signature
def __init__(self, word_size, n_hashing_functions, hasher):
super(GreedyPairSketchDirector, self).__init__(word_size, n_hashing_functions, hasher)
def getSketch(self, seq):
"""
Gets the minhash Sketch for a given sequence
:param str seq: A DNA sequence
:return list(int): A list representing the minhash Sketch
"""
window = Window(seq, self.word_size, 2)
primary_Sketch = self._min_hash.calcSketch(window.kmers)
kmer_pairs = []
for min_hash in primary_Sketch:
kmer_pairs.extend(window.makeKmerPairs(min_hash[1]))
return [x[0] for x in self._min_hash.calcSketch(kmer_pairs)]
class SignatureDirector(object):
"""Flow control for generating minhash signatures for a given sequence"""
class SingleSketchDirector(SketchDirector):
def __init__(self, word_size, n_hashing_functions, hasher):
super(SingleSketchDirector, self).__init__(word_size, n_hashing_functions, hasher)
def __init__(self, word_size, n_hashing_functions):
def getSketch(self, seq):
"""
Init method for class
:param int word_size: The kmer size to use
:param int n_hashing_functions: The number of hashing functions to use.
Gets the minhash Sketch for a given sequence
:param str seq: A DNA sequence
:return list(int): A list representing the minhash Sketch
"""
self.word_size = word_size
self._min_hash = LoggingMinHash(n_hashing_functions)
window = Window(seq, self.word_size, 1)
return [x[0] for x in self._min_hash.calcSketch(window.kmers)]
def getSignature(self, seq):
class ExhaustivePairSketchDirector(SketchDirector):
def __init__(self, word_size, n_hashing_functions, hasher):
super(ExhaustivePairSketchDirector, self).__init__(word_size, n_hashing_functions, hasher)
def getSketch(self, seq):
"""
Gets the minhash signature for a given sequence
Gets the minhash Sketch for a given sequence
:param str seq: A DNA sequence
:return list(int): A list representing the minhash signature
:return list(int): A list representing the minhash Sketch
"""
window = Window(seq, self.word_size)
primary_signature = self._min_hash.calcSignature(window.kmers)
kmer_pairs = []
window = Window(seq, self.word_size, 2)
for min_hash in primary_signature:
min_kmer = self._min_hash.getKmer(min_hash)
kmer_pairs.extend(window.makeKmerPairs(min_kmer))
return self._min_hash.calcSignature(kmer_pairs)
pairs = [pair for kmer in window.kmers for pair in window.makeKmerPairs(kmer)]
return [x[0] for x in self._min_hash.calcSketch(pairs)]
......@@ -13,29 +13,30 @@ import unittest
import cProfile
from pstats import Stats
from long_read_aligner.min_hash import MinHash, LoggingMinHash, SignatureDirector
from long_read_aligner.kmer import Kmer, Window
from long_read_aligner.utils.string_utils import CreateWindow, CreateKMERPairs
from long_read_aligner.kmer import Read
from long_read_aligner.min_hash import MinHash, GreedyPairSketchDirector, SingleSketchDirector, ExhaustivePairSketchDirector
from long_read_aligner.kmer import KmerSingle, Window
from long_read_aligner.utils.string_utils import CreateWindow, CreateKMERPairs, PerturbWindow
class MinHashTests(unittest.TestCase):
def test_calcSignature(self):
"""Test that the signature lenght is equal to the number of hashing fucntions"""
def test_calcSketch(self):
"""Test that the Sketch lenght is equal to the number of hashing fucntions"""
n_hash_permutations = 10
l_min_hash = LoggingMinHash(n_hash_permutations)
l_min_hash = MinHash(n_hash_permutations)
kmers = ['AA', 'TT', 'CC', 'GG']
kmers = [Kmer(i, x) for i, x in enumerate(kmers)]
kmers = [KmerSingle(i, x) for i, x in enumerate(kmers)]
signature = l_min_hash.calcSignature(kmers)
self.assertEqual(len(signature), n_hash_permutations)
Sketch = l_min_hash.calcSketch(kmers)
self.assertEqual(len(Sketch), n_hash_permutations)
def test_getMinimumHash(self):
"""Test that we can get the minimum hash"""
hashes = [0, 1, 2, 3, 4, 5]
min_hash = MinHash.getMinimumHash(hashes)
self.assertEquals(1, min_hash)
#def test_getMinimumHash(self):
# """Test that we can get the minimum hash"""
# hashes = [0, 1, 2, 3, 4, 5]
# min_hash = MinHash.getMinimumHash(hashes)
# self.assertEquals(1, min_hash)
def test_approximateJaccardGood(self):
"""Test that the approximate jaccard is 1 at max identity"""
......@@ -53,7 +54,7 @@ class MinHashTests(unittest.TestCase):
self.assertEquals(0, jaccard)
class SignatureDirectorTests(unittest.TestCase):
class SketchDirectorTests(unittest.TestCase):
def setUp(self):
"""Enable profiling"""
......@@ -69,24 +70,78 @@ class SignatureDirectorTests(unittest.TestCase):
p.print_stats()
print "\n--->>>"
def test_getSignature(self):
def test_getSketch(self):
""""""
test_seq = "AAATTACCCGGG"
test_seq2 = "AAATTTCCCGGG"
director = SignatureDirector(3, 100)
director = SingleSketchDirector(3, 100, MinHash)
sig1 = director.getSignature(test_seq)
sig2 = director.getSignature(test_seq2)
sig1 = director.getSketch(test_seq)
sig2 = director.getSketch(test_seq2)
jaccard = director._min_hash.approximateJaccard(sig1, sig2)
print jaccard
self.assertTrue(0 < jaccard < 1)
def test_time(self):
"""Kind of a stress test to get some time benchmarks. Here we generate a signature for a 1kb window"""
# def test_time(self):
# """Kind of a stress test to get some time benchmarks. Here we generate a Sketch for a 1kb window"""
# test_seq = CreateWindow(1000)
# director = SingleSketchDirector(8, 100, MinHash)
# director.getSketch(test_seq)
def test_getSimilarBig(self):
"""Kind of a stress test to get some time benchmarks. Here we generate a Sketch for a 1kb window"""
print "SINGLE"
test_seq = CreateWindow(1000)
director = SignatureDirector(8, 100)
director.getSignature(test_seq)
q_test_seq = PerturbWindow(test_seq, 0.3)
director = SingleSketchDirector(16, 200, MinHash)
director.run(test_seq)
Sketch = director.getSketch(q_test_seq)
q_test_read = Read(q_test_seq)
q_test_read.setSketch(Sketch)
print director.getSimilarReads(q_test_read)
# def test_SketchCache(self):
# """"""
# print "SINGLE"
# test_seq = "AAATTACCCGGG"
# test_seq2 = "AAATTTCCCGGG"
# test_seq3 = "AAATTTCCAGGT"
# director = SingleSketchDirector(8, 400, MinHash)
# director.run(test_seq)
# director.run(test_seq2)
# director.run(test_seq3)
# Sketch = director.getSketch(test_seq)
# test_read = Read(test_seq)
# test_read.setSketch(Sketch)
# print director.getSimilarReads(test_read)
def test_exhaustive(self):
print "EXHAUSTIVE"
test_seq = CreateWindow(1000)
q_test_seq = PerturbWindow(test_seq, 0.3)
director = ExhaustivePairSketchDirector(8, 200, MinHash)
director.run(test_seq)
Sketch = director.getSketch(q_test_seq)
q_test_read = Read(q_test_seq)
q_test_read.setSketch(Sketch)
print director.getSimilarReads(q_test_read)
def test_greedy(self):
print "GREEDY"
test_seq = CreateWindow(1000)
q_test_seq = PerturbWindow(test_seq, 0.3)
director = GreedyPairSketchDirector(8, 200, MinHash)
director.run(test_seq)
Sketch = director.getSketch(q_test_seq)
q_test_read = Read(q_test_seq)
q_test_read.setSketch(Sketch)
print director.getSimilarReads(q_test_read)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment