Skip to content

Commit

Permalink
Optimize gaps and reduce methods (#34)
Browse files Browse the repository at this point in the history
Optimizing a couple of methods in `IRanges`:
- Update `gaps` and `reduce` to slightly faster NumPy based operations.
- Switch `np.array` to `np.asarray`
  • Loading branch information
jkanche committed Jun 20, 2024
1 parent f40ec22 commit 9491680
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 117 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## Version 0.2.8

Optimizing a couple of methods in `IRanges`:

- Update `gaps` and `reduce` to slightly faster NumPy based operations.
- Switch `np.array` to `np.asarray`

## Version 0.2.7

Changes to be compatible with NumPy's 2.0 release:
Expand All @@ -9,7 +16,7 @@ Changes to be compatible with NumPy's 2.0 release:
## Version 0.2.4

- Support coersion from `IRanges` to Polars and vice-versa
- Support coercion from `IRanges` to Polars and vice-versa
- Setting up `myst_nb` to execute snippets in tutorial/documentation markdown files

## Version 0.2.3
Expand Down
188 changes: 84 additions & 104 deletions src/iranges/IRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def _sanitize_vec_argument(
if isinstance(vec, int):
return vec
elif ut.is_list_of_type(vec, int, ignore_none=allow_none):
vec = np.array(vec)
vec = np.asarray(vec)

if len(vec) < _size:
raise ValueError("Provided argument must match the number of intervals.")
Expand Down Expand Up @@ -852,7 +852,7 @@ def coverage(
raise TypeError("'width' must be an integer or float.")

if isinstance(width, (np.ndarray, list)):
width = max(width)
width = width.max()

cov, _ = create_np_interval_vector(new_ranges, force_size=width, value=weight)
return cov
Expand All @@ -865,8 +865,8 @@ def range(self) -> "IRanges":
the minimum of all the start positions, Maximum of all end positions.
"""

min_start = min(self.start)
max_end = max(self.end)
min_start = self.start.min()
max_end = self.end.max()

return IRanges([min_start], [max_end - min_start])

Expand Down Expand Up @@ -896,60 +896,43 @@ def reduce(
if min_gap_width < 0:
raise ValueError("'min_gap_width' cannot be negative.")

_order = self.order()
order = self.order()
starts = self._start[order]
widths = self._width[order]
ends = starts + widths - 1

if drop_empty_ranges:
valid_mask = widths > 0
starts = starts[valid_mask]
ends = ends[valid_mask]
widths = widths[valid_mask]
order = np.array(order)[valid_mask]

gaps = np.r_[starts[1:] - ends[:-1], np.inf]
merge_mask = np.r_[True, gaps <= min_gap_width][:-1]
merge_groups = np.cumsum(~merge_mask)
unique_groups = np.unique(merge_groups)

result_starts = []
result_widths = []
result_revmaps = []
counter = 0

def get_elem_counter(idx):
elem = self[idx]
start = elem.start[0]
end = elem.end[0] - 1
width = elem.width[0]
return start, end, width

current_start, current_end, _ = get_elem_counter(_order[counter])
current_revmaps = [_order[counter]]
for group in unique_groups:
group_mask = merge_groups == group
group_starts = starts[group_mask]
group_ends = ends[group_mask]
group_indices = order[group_mask]

counter += 1
while counter < len(_order):
merge = False
start = group_starts.min()
end = group_ends.max()
width = end - start + 1

o = _order[counter]
_idx_start, _idx_end, _idx_width = get_elem_counter(o)
_gap_width = _idx_start - current_end

if _gap_width <= min_gap_width:
merge = True

if merge is True:
if current_end < _idx_end:
current_end = _idx_end

current_revmaps.append(o)
counter += 1
else:
if not (
drop_empty_ranges is True and current_end - current_start + 1 == 0
):
result_starts.append(current_start)
result_widths.append(current_end - current_start + 1)
result_revmaps.append(current_revmaps)

current_revmaps = [o]
current_start = _idx_start
current_end = _idx_end
counter += 1

result_starts.append(current_start)
result_widths.append(current_end - current_start + 1)
result_revmaps.append(current_revmaps)
result_starts.append(start)
result_widths.append(width)
result_revmaps.append(group_indices.tolist())

result = IRanges(result_starts, result_widths)

if with_reverse_map is True:
if with_reverse_map:
result._mcols.set_column("revmap", result_revmaps, in_place=True)

return result
Expand All @@ -976,9 +959,14 @@ def order(self, decreasing: bool = False) -> np.ndarray:
Returns:
NumPy vector containing index positions in the sorted order.
"""
intvals = sorted(self._get_intervals_as_list(), reverse=decreasing)
order = [o[2] for o in intvals]
return np.array(order)
order_buf = sorted(
range(len(self)), key=lambda i: (self._start[i], self._width[i])
)

if decreasing:
return np.asarray(order_buf[::-1])

return np.asarray(order_buf)

def sort(self, decreasing: bool = False, in_place: bool = False) -> "IRanges":
"""Sort the intervals.
Expand Down Expand Up @@ -1013,52 +1001,44 @@ def gaps(self, start: Optional[int] = None, end: Optional[int] = None) -> "IRang
Returns:
A new ``IRanges`` is with the gap regions.
"""
_order = self.order()

overlap_start = min(self.start) if start is None else start
overlap_end = overlap_start - 1 if start is not None else None

result_starts = []
result_widths = []
out_ranges = []
order_buf = self.order()

def get_elem_counter(idx):
elem = self[idx]
start = elem.start[0]
end = elem.end[0] - 1
width = elem.width[0]
return start, end, width

for i in range(len(_order)):
_start, _end, _width = get_elem_counter(_order[i])
if start is not None:
max_end = start - 1
else:
max_end = float("inf")

if _width == 0:
for i in order_buf:
width_j = self._width[i]
if width_j == 0:
continue
start_j = self._start[i]
end_j = start_j + width_j - 1

if overlap_end is None:
overlap_end = _end
if max_end == float("inf"):
max_end = end_j
else:
_gap_start = overlap_end + 1

if end is not None and _start > end + 1:
_start = end + 1

_gap_width = _start - _gap_start

if _gap_width >= 1:
result_starts.append(_gap_start)
result_widths.append(_gap_width)
overlap_end = _end
elif _end > overlap_end:
overlap_end = _end

if end is not None and overlap_end >= end:
gapstart = max_end + 1
if end is not None and start_j > end + 1:
start_j = end + 1
gapwidth = start_j - gapstart
if gapwidth >= 1:
out_ranges.append((gapstart, gapwidth))
max_end = end_j
elif end_j > max_end:
max_end = end_j

if end is not None and max_end >= end:
break

if end is not None and overlap_end is not None and overlap_end < end:
result_starts.append(overlap_end + 1)
result_widths.append(end - overlap_end)
if end is not None and max_end is not None and max_end < end:
gapstart = max_end + 1
gapwidth = end - max_end
out_ranges.append((gapstart, gapwidth))

return IRanges(result_starts, result_widths)
_gapstarts, _gapends = zip(*out_ranges)
return IRanges(_gapstarts, _gapends)

# folows the same logic as in https://stackoverflow.com/questions/55480499/split-set-of-intervals-into-minimal-set-of-disjoint-intervals
# otherwise too much magic happening here - https://github.com/Bioconductor/IRanges/blob/5acb46b3f2805f7f74fe4cb746780e75f8257a83/R/inter-range-methods.R#L389
Expand Down Expand Up @@ -1254,8 +1234,8 @@ def narrow(

counter += 1

output._start = np.array(new_starts)
output._width = np.array(new_widths)
output._start = np.asarray(new_starts)
output._width = np.asarray(new_widths)
return output

def resize(
Expand Down Expand Up @@ -1330,7 +1310,7 @@ def resize(
counter += 1

output = self._define_output(in_place)
output._start = np.array(new_starts)
output._start = np.asarray(new_starts)
output._width = (
np.repeat(_awidth, len(self)) if isinstance(_awidth, int) else _awidth
)
Expand Down Expand Up @@ -1571,7 +1551,7 @@ def overlap_indices(

counter += 1

return np.array(overlaps)
return np.asarray(overlaps)

########################
#### set operations ####
Expand Down Expand Up @@ -1621,8 +1601,8 @@ def setdiff(self, other: "IRanges") -> "IRanges":
if not isinstance(other, IRanges):
raise TypeError("'other' is not an `IRanges` object.")

start = min(min(self.start), min(other.start))
end = max(max(self.end), max(other.end))
start = min(self.start.min(), other.start.min())
end = max(self.end.max(), other.end.max())

x_gaps = self.gaps(start=start, end=end)
x_gaps_u = x_gaps.union(other)
Expand All @@ -1648,8 +1628,8 @@ def intersect(self, other: "IRanges") -> "IRanges":
if not isinstance(other, IRanges):
raise TypeError("'other' is not an `IRanges` object.")

start = min(min(self.start), min(other.start))
end = max(max(self.end), max(other.end))
start = min(self.start.min(), other.start.min())
end = max(self.end.max(), other.end.max())

_gaps = other.gaps(start=start, end=end)
_inter = self.setdiff(_gaps)
Expand All @@ -1668,7 +1648,7 @@ def _build_ncls_index(self):

if not hasattr(self, "_ncls"):
self._ncls = NCLS(
self.start, self.end, np.array([i for i in range(len(self))])
self.start, self.end, np.asarray([i for i in range(len(self))])
)

def _delete_ncls_index(self):
Expand All @@ -1690,7 +1670,7 @@ def _generic_find_hits(
new_starts = query._start - gap_start - 1
new_ends = query.end + gap_end + 1
_res = self._ncls.all_overlaps_both(
new_starts, new_ends, np.array(range(len(query)))
new_starts, new_ends, np.asarray(range(len(query)))
)
all_overlaps = [[] for _ in range(len(query))]

Expand Down Expand Up @@ -1842,7 +1822,7 @@ def count_overlaps(
min_overlap=min_overlap,
delete_index=delete_index,
)
return np.array([len(x) for x in _overlaps])
return np.asarray([len(x) for x in _overlaps])

def subset_by_overlaps(
self,
Expand Down Expand Up @@ -1909,8 +1889,8 @@ def _generic_search(
select,
delete_index=False,
):
min_start = min(self.start)
max_end = max(self.end)
min_start = self.start.min()
max_end = self.end.max()
hits = []
for _, val in query:
_iterate = True
Expand Down Expand Up @@ -2112,7 +2092,7 @@ def distance(self, query: "IRanges") -> np.ndarray:

all_distances.append(distance)

return np.array(all_distances)
return np.asarray(all_distances)

########################
#### pandas interop ####
Expand Down
22 changes: 10 additions & 12 deletions src/iranges/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def create_np_interval_vector(

max_end = force_size
if max_end is None:
max_end = max(intervals.get_end())
max_end = intervals.get_end().max()
else:
max_end += 1

Expand Down Expand Up @@ -89,16 +89,14 @@ def calc_gap_and_overlap(
Interval containing start and end positions.
`end` is non-inclusive.
"""
_gap = None
_overlap = None

if first[0] < second[1] and first[1] > second[0]:
if min(first[1], second[1]) > max(first[0], second[0]):
_overlap = min(first[1], second[1]) - max(first[0], second[0])
else:
_gap = None
if second[0] >= first[1]:
_gap = second[0] - first[1]
elif first[0] >= second[1]:
_gap = first[0] - second[1]
return (None, _overlap)

_gap = None
if second[0] >= first[1]:
_gap = second[0] - first[1]
elif first[0] >= second[1]:
_gap = first[0] - second[1]

return (_gap, _overlap)
return (_gap, None)

0 comments on commit 9491680

Please sign in to comment.