Commit cbce34aa authored by akdel's avatar akdel

major commit with cli

parent b9a2bdbf
{
"python.linting.pylintEnabled": true,
"python.linting.banditEnabled": false,
"python.linting.enabled": true
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
import numpy as np
from dataclasses import dataclass
import typing as ty
import itertools
from OptiMap import molecule_struct as ms
def generate_bnx_arrays(bnx_filename: str) -> ty.Iterator[dict]:
lines: ty.List[str] = list()
for k, g in itertools.groupby((x for x in open(bnx_filename, "r")), lambda x: x.split("\t")[0]):
if k.startswith("#"):
continue
else:
if len(lines) == 4:
yield {
"info": [x.strip() for x in lines[0].split("\t")],
"labels": [float(x) for x in lines[1].split()[1:]],
"label_snr": [float(x) for x in lines[2].split()[1:]],
"raw_intensities": [float(x) for x in lines[3].split()[1:]],
"signal": None
}
lines: ty.List[str] = [list(g)[0]]
else:
lines.append(list(g)[0])
@dataclass
class SquareWave:
labels: ty.List[int]
length: int
wave_width: int
wave: np.ndarray
zoom: int
idx: int
@classmethod
def from_bnx_line(cls, bnx_array_entry: dict, reverse=False,
zoom_factor: int = 500,
snr: float = 3.5,
width: int = 10) -> "SquareWave":
index: int = int(bnx_array_entry["info"][1])
length: float = float(bnx_array_entry["info"][2])
labels: ty.List[float] = list((np.array(bnx_array_entry["labels"][:-1])).astype(int)[
np.array(bnx_array_entry["label_snr"]) >= snr])
if reverse:
index *= -1
return cls.from_labels(labels,
length,
zoom=zoom_factor,
reverse=reverse,
width=width,
idx=index)
@classmethod
def from_optimap_molecule(cls,
optimap_molecule: ms.MoleculeNoBack,
molecule_id: int,
reverse: bool = False,
zoom: int = 1,
log: bool = False) -> 'SquareWave':
labels: ty.List[int] = [int(x) for x in optimap_molecule.nick_coordinates]
if reverse:
labels: ty.List[int] = [(optimap_molecule.nick_signal.shape[0] - x) for x in labels]
if log:
wave: np.ndarray = optimap_molecule.log_signal
wave[wave < 1] = 0
wave[wave >= 1] = 1
else:
wave: np.ndarray = optimap_molecule.nick_signal
return cls(labels, wave.shape[0], -1, wave if not reverse else wave[::-1],
zoom, molecule_id if not reverse else -molecule_id)
@classmethod
def from_labels(cls,
labels: ty.List[float],
length: float,
zoom: int = 1,
width: int = 10,
reverse: bool = False,
idx: int = 0) -> 'SquareWave':
if reverse:
labels: ty.List[float] = [(length - x) for x in labels]
if zoom < 1:
zoom: int = 1
assert length >= max(labels)
half_width: int = width//2
labels: ty.List[int] = [int(x // zoom) for x in labels]
length: int = int(length // zoom)
wave: np.ndarray = np.zeros(length, dtype="int8")
for label in labels:
start, end = max(0, label - half_width), min(length, label + half_width)
wave[start: end] = 1
assert len(labels) > 1
return cls(labels, length, width, wave, zoom, idx)
def reform_wave_from_labels_with_width(self, width: int = 10) -> "SquareWave":
return self.from_labels(self.labels, self.length, 1, width, False, self.idx)
def segment(self, length: int = 150000) -> np.ndarray:
segments: ty.List[ty.List[int]] = list()
length //= self.zoom
for label in self.labels:
if (self.length - label) >= length:
segments.append(list(self.wave[label: label + length]))
else:
continue
return np.array(segments).astype("uint8")
def compress(self, length: int = 150000, nbits: int = 32, limit: ty.Tuple[float, float] = (0.25, 0.75)):
segments: np.ndarray = self.segment(length)
if not len(segments):
return None
compressed_segments: np.ndarray = np.zeros((segments.shape[0], nbits), dtype=np.uint8)
ranges = np.round(np.linspace(0, segments.shape[1], nbits), 0).astype(int)
interval: int = ranges[1]
limit_lower, limit_upper = int(nbits * limit[0]), int(nbits * limit[1])
for i, segment in enumerate(segments):
for j, adjusted_j in enumerate(ranges):
compressed_segments[i, j] = np.sum(segment[adjusted_j: adjusted_j + interval]) >= (interval // 2)
compressed_segments: np.ndarray = np.array([x for x in compressed_segments if limit_lower <= np.sum(x) <= limit_upper])
if not len(compressed_segments):
return None
return np.packbits(compressed_segments).view(f"uint{nbits}").flatten()
if __name__ == "__main__":
length_ = 350
labels_ = list(np.sort(np.random.randint(0, length_, 25)).astype(float))
import matplotlib
matplotlib.use("Qt5Agg")
import matplotlib.pyplot as plt
sq = SquareWave.from_labels(labels_, length_, width=8)
segs = sq.compress(75)
for seg in np.unpackbits(segs.view("uint8")).reshape((segs.shape[0], -1)):
plt.plot(sq.wave)
plt.show()
plt.plot(seg)
plt.show()
......@@ -24,6 +24,7 @@ def _normalized_correlation(mol1, mol2, self_corr1, self_corr2, res, total, limi
mol2_self_segment = np.dot(mol2[-(total-i):],mol2[-(total-i):])
res[i] = np.dot(mol1_segment,mol2_segment)/sqrt(mol1_self_segment*mol2_self_segment)
@nb.njit
def normalized_correlation(mol1, mol2, limit=100):
self_corr1 = mol1**2
......@@ -31,3 +32,20 @@ def normalized_correlation(mol1, mol2, limit=100):
res = np.zeros(mol1.shape[0] + mol2.shape[0], dtype=np.float64)
_normalized_correlation(mol2, mol1, self_corr1, self_corr2, res, res.shape[0], limit)
return res
@nb.njit(parallel=True)
def get_filtered_alignments(mol, prerest, prelens, filter_ids, limit=250):
res = np.zeros((filter_ids.shape[0], 2))
rest = prerest[filter_ids]
lens = prelens[filter_ids]
for i in nb.prange(res.shape[0]):
current_mol = np.zeros(rest[i][:lens[i]].shape)
current_mol[:] = rest[i][:lens[i]]
if current_mol.shape[0] > mol.shape[0]:
f = np.max(normalized_correlation(current_mol, mol, limit=limit))
else:
f = np.max(normalized_correlation(mol, current_mol, limit=limit))
res[i, 0] = f
res[i, 1] = filter_ids[i]
return res
import fire
from OptiMap.OptiSpeed import compression as comp
from OptiMap.OptiSpeed import square_wave as sw
import ray
import typing as ty
from OptiMap import correlation_struct as cs
from OptiMap import molecule_struct as ms
import enum
class InputType(enum.Enum):
OptiMap = "optimap"
BNX = "bnx"
WrongType = "none"
class OptiMapType(enum.Enum):
LogMolecule = "log"
RawMolecule = "raw"
SquareWaveMolecule = "square_wave"
def compute_pairwise_deep(input_filename: str,
output_filename: str = "out.tsv",
molecule_type: str = "square_wave",
first_score: float = 0.9,
second_score: float = 0.75,
combine_sparse: bool = False,
number_of_threads: int = 4):
if first_score <= second_score:
from warnings import warn
warn("second score should be lower to have an effect", Warning)
if input_filename.endswith(".bnx"):
this_type = InputType.BNX
elif input_filename.endswith(".npy"):
this_type = InputType.OptiMap
else:
this_type = InputType.WrongType
if this_type == InputType.BNX:
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_bnx_arrays(input_filename,
min_label=10).to_molecule_array()
elif this_type == InputType.OptiMap:
import numpy as np
molecules: ty.List[ms.MoleculeNoBack] = [ms.MoleculeNoBack(raw_molecule, 3.) for raw_molecule in
np.load(input_filename, allow_pickle=True)]
if molecule_type == "square_wave":
sq_waves: ty.List[sw.SquareWave] = [
sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True).reform_wave_from_labels_with_width(13)
for i, molecule in enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True,
reverse=True).reform_wave_from_labels_with_width(13) for
i, molecule in enumerate(molecules)]
elif molecule_type == "log":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True, reverse=True) for
i, molecule in enumerate(molecules)]
elif molecule_type == "raw":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False, reverse=True) for
i, molecule in enumerate(molecules)]
else:
exit(1)
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_waves(sq_waves).to_molecule_array()
else:
exit(1)
if not combine_sparse:
comp.OptiSpeedResults \
.from_molecule_array_all_vs_all(molecule_array)\
.to_all_correlation_results(molecule_array, thr=first_score)\
.transitive_closure(molecule_array, thr=second_score)\
.write_to_file(output_filename)
else:
comp.OptiSpeedResults \
.from_molecule_array_all_vs_all(molecule_array) \
.to_all_correlation_results(molecule_array, thr=first_score) \
.transitive_closure(molecule_array, thr=second_score) \
.pairs_to_sparse_correlation_results(molecule_array,
thr=second_score,
number_of_threads=number_of_threads,
minimum_overlapping_labels=12)\
.write_to_file(output_filename)
if __name__ == "__main__":
fire.Fire(compute_pairwise_deep)
import fire
from OptiMap.OptiSpeed import compression as comp
from OptiMap.OptiSpeed import square_wave as sw
import ray
import typing as ty
from OptiMap import correlation_struct as cs
from OptiMap import molecule_struct as ms
import enum
class InputType(enum.Enum):
OptiMap = "optimap"
BNX = "bnx"
WrongType = "none"
class OptiMapType(enum.Enum):
LogMolecule = "log"
RawMolecule = "raw"
SquareWaveMolecule = "square_wave"
def compute_pairwise_naive(input_filename: str,
output_filename: str = "out.tsv",
molecule_type: str = "square_wave",
score: float = 0.9,
min_overlap_length: int = 250):
if input_filename.endswith(".bnx"):
this_type = InputType.BNX
elif input_filename.endswith(".npy"):
this_type = InputType.OptiMap
else:
this_type = InputType.WrongType
if this_type == InputType.BNX:
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_bnx_arrays(input_filename,
min_label=10).to_molecule_array()
elif this_type == InputType.OptiMap:
import numpy as np
molecules: ty.List[ms.MoleculeNoBack] = [ms.MoleculeNoBack(raw_molecule, 3.) for raw_molecule in
np.load(input_filename, allow_pickle=True)]
if molecule_type == "square_wave":
sq_waves: ty.List[sw.SquareWave] = [
sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True).reform_wave_from_labels_with_width(13)
for i, molecule in enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True,
reverse=True).reform_wave_from_labels_with_width(13) for
i, molecule in enumerate(molecules)]
elif molecule_type == "log":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True, reverse=True) for
i, molecule in enumerate(molecules)]
elif molecule_type == "raw":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False, reverse=True) for
i, molecule in enumerate(molecules)]
else:
exit(1)
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_waves(sq_waves).to_molecule_array()
else:
exit(1)
comp.OptiSpeedResults \
.from_molecule_array_all_vs_all(molecule_array) \
.to_all_correlation_results(molecule_array, thr=score, length_limit=min_overlap_length) \
.write_to_file(output_filename)
if __name__ == "__main__":
fire.Fire(compute_pairwise_naive)
import fire
from OptiMap import correlation_struct as cs
from OptiMap import molecule_struct as ms
from OptiMap.OptiSpeed import square_wave as sw
from OptiMap.OptiSpeed import compression as comp
import typing as ty
import ray
import enum
class InputType(enum.Enum):
OptiMap = "optimap"
BNX = "bnx"
WrongType = "none"
class OptiMapType(enum.Enum):
LogMolecule = "log"
RawMolecule = "raw"
SquareWaveMolecule = "square_wave"
def compute_pairwise_sparse(input_filename: str, min_number_of_nicks: int = 12,
molecule_type: str = "square_wave",
output_filename: str = "out.tsv",
number_of_threads: int = 4,
score: float = 0.8):
if input_filename.endswith(".bnx"):
this_type = InputType.BNX
elif input_filename.endswith(".npy"):
this_type = InputType.OptiMap
else:
this_type = InputType.WrongType
if this_type == InputType.BNX:
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_bnx_arrays(input_filename,
min_label=10).to_molecule_array()
elif this_type == InputType.OptiMap:
import numpy as np
molecules: ty.List[ms.MoleculeNoBack] = [ms.MoleculeNoBack(raw_molecule, 3.) for raw_molecule in
np.load(input_filename, allow_pickle=True)]
if molecule_type == "square_wave":
sq_waves: ty.List[sw.SquareWave] = [
sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True).reform_wave_from_labels_with_width(13)
for i, molecule in enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True,
reverse=True).reform_wave_from_labels_with_width(13) for
i, molecule in enumerate(molecules)]
elif molecule_type == "log":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True, reverse=True) for
i, molecule in enumerate(molecules)]
elif molecule_type == "raw":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False, reverse=True) for
i, molecule in enumerate(molecules)]
else:
exit(1)
molecule_array: comp.MoleculeArray = comp.CompressedAndScored.from_waves(sq_waves).to_molecule_array()
else:
exit(1)
comp.OptiSpeedResults \
.from_molecule_array_all_vs_all(molecule_array)\
.pairs_to_sparse_correlation_results(molecule_array,
number_of_threads=number_of_threads,
thr=score,
minimum_overlapping_labels=min_number_of_nicks)\
.write_to_file(output_filename)
if __name__ == "__main__":
fire.Fire(compute_pairwise_sparse)
import fire
from OptiMap.OptiSpeed import compression as comp
from OptiMap.OptiSpeed import square_wave as sw
import ray
import typing as ty
from OptiMap import correlation_struct as cs
from OptiMap import molecule_struct as ms
import enum
class InputType(enum.Enum):
OptiMap = "optimap"
BNX = "bnx"
WrongType = "none"
class OptiMapType(enum.Enum):
LogMolecule = "log"
RawMolecule = "raw"
SquareWaveMolecule = "square_wave"
def compute_pairwise_from_optispeed(input_filename: str,
output_filename: str = "out.tsv",
molecule_type: str = "square_wave",
first_score: float = 0.9,
second_score: float = 0.75,
combine_sparse: bool = False,
number_of_threads: int = 4,
hamming_threshold: int = 3,
top_optispeed: int = 30,
optispeed_file: str = "optispeed_results.opt"):
if first_score <= second_score:
from warnings import warn
warn("second score should be lower to have an effect", Warning)
if input_filename.endswith(".bnx"):
this_type = InputType.BNX
elif input_filename.endswith(".npy"):
this_type = InputType.OptiMap
else:
this_type = InputType.WrongType
if this_type == InputType.BNX:
compressed = comp.CompressedAndScored.from_bnx_arrays(input_filename, min_label=10, length=120*500)
compressed.run_optispeed_and_output_results(optispeed_file, match_score_threshold=0.,
hamming_threshold=hamming_threshold,
top=top_optispeed)
molecule_array: comp.MoleculeArray = compressed.from_bnx_arrays(input_filename,
min_label=10).to_molecule_array()
elif this_type == InputType.OptiMap:
import numpy as np
molecules: ty.List[ms.MoleculeNoBack] = [ms.MoleculeNoBack(raw_molecule, 3.) for raw_molecule in
np.load(input_filename, allow_pickle=True)]
if molecule_type == "square_wave":
sq_waves: ty.List[sw.SquareWave] = [
sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True).reform_wave_from_labels_with_width(13)
for i, molecule in enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True,
reverse=True).reform_wave_from_labels_with_width(13) for
i, molecule in enumerate(molecules)]
elif molecule_type == "log":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=True, reverse=True) for
i, molecule in enumerate(molecules)]
elif molecule_type == "raw":
sq_waves: ty.List[sw.SquareWave] = [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False) for
i, molecule in
enumerate(molecules)]
sq_waves += [sw.SquareWave.from_optimap_molecule(molecule, i + 1, log=False, reverse=True) for
i, molecule in enumerate(molecules)]
else:
exit(1)
compressed: comp.CompressedAndScored = comp.CompressedAndScored.from_waves(sq_waves, length=120,
segment_limit=(0.1, 0.7))
compressed.run_optispeed_and_output_results(optispeed_file, match_score_threshold=0.,
hamming_threshold=hamming_threshold, top=top_optispeed)
molecule_array: comp.MoleculeArray = compressed.to_molecule_array()
else:
exit(1)
if not combine_sparse:
comp.OptiSpeedResults \
.from_optispeed_results_file(optispeed_file)\
.to_all_correlation_results(molecule_array, thr=first_score)\
.transitive_closure(molecule_array, thr=second_score)\
.write_to_file(output_filename)
else:
comp.OptiSpeedResults \
.from_optispeed_results_file(optispeed_file) \
.to_all_correlation_results(molecule_array, thr=first_score) \
.transitive_closure(molecule_array, thr=second_score) \
.pairs_to_sparse_correlation_results(molecule_array,
thr=second_score,
number_of_threads=number_of_threads,
minimum_overlapping_labels=12)\
.write_to_file(output_filename)
if __name__ == "__main__":
fire.Fire(compute_pairwise_from_optispeed)
......@@ -2,9 +2,10 @@ from distutils.core import setup
setup(name='OptiMap',
version='1.0',
description='Bionano Tiff to Optical Map Signals',
description='An optical map molecule alignment tool.',
author='Mehmet Akdel',
author_email='mehmet.akdel@wur.nl',
url='https://gitlab.com/akdel/',
packages=['OptiMap'],
install_requires=["sqlalchemy", "numpy", "numba", "scipy", "intervaltree", "matplotlib", "cython"],)
packages=['OptiMap', "OptiMap/OptiSpeed"],
install_requires=["sqlalchemy", "numpy", "numba", "scipy", "intervaltree",
"matplotlib", "cython", "Simple-LSH", "fire", "ray"])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment