Added faster comment splitting

* added functions to copy a segment & its base so comments can be marked on the copy of the base and show up in the copy of the segment
This commit is contained in:
Rob McMullen 2017-03-13 07:05:37 -07:00
parent afa9c9786a
commit f1b0f5ebac
2 changed files with 138 additions and 19 deletions

View File

@ -157,6 +157,14 @@ class SegmentData(object):
buf = cStringIO.StringIO(self.data[:]) buf = cStringIO.StringIO(self.data[:])
return buf return buf
@property
def data_base(self):
return self.data.base if self.data.base is not None else self.data
@property
def style_base(self):
return self.style.base if self.style.base is not None else self.style
def get_data(self): def get_data(self):
return self.data return self.data
@ -227,10 +235,17 @@ class SegmentData(object):
d = self.data.np_data.copy() d = self.data.np_data.copy()
s = self.style.np_data.copy() s = self.style.np_data.copy()
copy = SegmentData(d, s, order=self.order) copy = SegmentData(d, s, order=self.order)
else: elif self.data.base is None:
# if there is no base array, we aren't looking at a slice so we
# must be copying the entire array.
d = self.data.copy() d = self.data.copy()
s = self.style.copy() s = self.style.copy()
copy = SegmentData(d, s)
else:
d = self.data.base.copy()
s = self.style.base.copy()
start, end = self.byte_bounds_offset() start, end = self.byte_bounds_offset()
print "copy: start, end =", start, end
copy = SegmentData(d[start:end], s[start:end]) copy = SegmentData(d[start:end], s[start:end])
return copy return copy
@ -480,6 +495,17 @@ class DefaultSegment(object):
matches = (self.style & style_bits) == style_bits matches = (self.style & style_bits) == style_bits
return self.bool_to_ranges(matches) return self.bool_to_ranges(matches)
def get_comment_locations(self, **kwargs):
style_bits = self.get_style_bits(**kwargs)
r = self.rawdata.copy()
print len(r.style)
print len(r.style_base)
base = r.style_base & style_bits
comment_indexes = np.asarray(self.rawdata.extra.comments.keys(), dtype=np.uint32)
print comment_indexes
base[comment_indexes] |= comment_bit_mask
return r.style
def get_entire_style_ranges(self, split_comments=None, **kwargs): def get_entire_style_ranges(self, split_comments=None, **kwargs):
"""Find sections of the segment that have the same style value. """Find sections of the segment that have the same style value.
@ -492,32 +518,41 @@ class DefaultSegment(object):
tuple; and an integer with the style value. tuple; and an integer with the style value.
""" """
style_bits = self.get_style_bits(**kwargs) style_bits = self.get_style_bits(**kwargs)
matches = self.style & style_bits matches = self.get_comment_locations(**kwargs)
if split_comments is None:
split_comments = set()
else:
split_comments = set(split_comments)
groups = np.split(matches, np.where(np.diff(matches) != 0)[0] + 1) groups = np.split(matches, np.where(np.diff(matches) != 0)[0] + 1)
# print groups
# split into groups with the same numbers # split into groups with the same numbers
ranges = [] ranges = []
last_end = 0 last_end = 0
if len(groups) == 1 and len(groups[0]) == 0: if len(groups) == 1 and len(groups[0]) == 0:
# check for degenerate case # check for degenerate case
return return
last_style = -1
for group in groups: for group in groups:
next_end = last_end + len(group) # each group is guaranteed to have the same style
size = len(group)
next_end = last_end + size
style = matches[last_end] style = matches[last_end]
if style in split_comments: masked_style = style & style_bits
comment_list = self.get_comments_in_range(last_end, next_end) # print last_end, next_end, style, masked_style, size, group
for index in sorted(comment_list.keys()): if style & comment_bit_mask:
if last_end == index: if masked_style in split_comments:
# skip if the comment is at the start point because it # print "interesting comment", last_end, next_end
# will always be split at the start point ranges.append(((last_end, next_end), masked_style))
continue else:
ranges.append(((last_end, index), style)) # print "non-interesting comment", last_end, next_end
last_end = index if last_style == masked_style:
if last_end < next_end: ((prev_end, _), _) = ranges.pop()
ranges.append(((last_end, next_end), style)) ranges.append(((prev_end, next_end), masked_style))
else:
ranges.append(((last_end, next_end), masked_style))
else:
if last_style == masked_style:
((prev_end, _), _) = ranges.pop()
ranges.append(((prev_end, next_end), masked_style))
else:
ranges.append(((last_end, next_end), masked_style))
last_style = masked_style
last_end = next_end last_end = next_end
return ranges return ranges

View File

@ -3,7 +3,7 @@ import os
import numpy as np import numpy as np
import pytest import pytest
from atrcopy import DefaultSegment, SegmentData, get_xex, interleave_segments from atrcopy import DefaultSegment, SegmentData, get_xex, interleave_segments, user_bit_mask
def get_indexed(segment, num, scale): def get_indexed(segment, num, scale):
@ -181,6 +181,87 @@ class TestIndexed(object):
assert not np.all((c.data[:] - s.data[:]) == 0) assert not np.all((c.data[:] - s.data[:]) == 0)
class TestComments(object):
def setup(self):
data = np.ones([4000], dtype=np.uint8)
r = SegmentData(data)
self.segment = DefaultSegment(r, 0)
self.sub_segment = DefaultSegment(r[2:202], 2)
def test_locations(self):
s = self.segment
s.set_comment([[4,5]], "test1")
s.set_comment([[40,50]], "test2")
s.set_style_ranges([[2,100]], comment=True)
s.set_style_ranges([[200, 299]], data=True)
for i in range(1,4):
for j in range(1, 4):
# create some with overlapping regions, some without
r = [500*j, 500*j + 200*i + 200]
s.set_style_ranges([r], user=i)
s.set_user_data([r], i, i*10 + j)
r = [100, 200]
s.set_style_ranges([r], user=4)
s.set_user_data([r], 4, 99)
r = [3100, 3200]
s.set_style_ranges([r], user=4)
s.set_user_data([r], 4, 99)
s2 = self.sub_segment
print len(s2)
copy = s2.get_comment_locations()
print copy
# comments at 4 and 40 in the original means 2 and 38 in the copy
orig = s.get_comment_locations()
assert copy[2] == orig[4]
assert copy[28] == orig[38]
def test_split_data_at_comment(self):
s = self.segment
s.set_style_ranges([[0,1000]], data=True)
for i in range(0, len(s), 25):
s.set_comment([[i,i+1]], "comment at %d" % i)
s2 = self.sub_segment
print len(s2)
copy = s2.get_comment_locations()
print copy
# comments at 4 and 40 in the original means 2 and 38 in the copy
orig = s.get_comment_locations()
print orig[0:200]
assert copy[2] == orig[4]
assert copy[28] == orig[38]
r = s2.get_entire_style_ranges([1], user=True)
print r
assert r == [((0, 23), 1), ((23, 48), 1), ((48, 73), 1), ((73, 98), 1), ((98, 123), 1), ((123, 148), 1), ((148, 173), 1), ((173, 198), 1), ((198, 200), 1)]
def test_split_data_at_comment2(self):
s = self.segment
start = 0
i = 0
for end in range(40, 1000, 40):
s.set_style_ranges([[start, end]], user=i)
start = end
i = (i + 1) % 8
for i in range(0, len(s), 25):
s.set_comment([[i,i+1]], "comment at %d" % i)
s2 = self.sub_segment
print len(s2)
copy = s2.get_comment_locations()
print copy
# comments at 4 and 40 in the original means 2 and 38 in the copy
orig = s.get_comment_locations()
print orig[0:200]
assert copy[2] == orig[4]
assert copy[28] == orig[38]
r = s2.get_entire_style_ranges([1], user=user_bit_mask)
print r
assert r == [((0, 38), 0), ((38, 48), 1), ((48, 73), 1), ((73, 78), 1), ((78, 118), 2), ((118, 158), 3), ((158, 198), 4), ((198, 200), 5)]
if __name__ == "__main__": if __name__ == "__main__":
t = TestIndexed() t = TestIndexed()
t.setup() t.setup()
@ -191,3 +272,6 @@ if __name__ == "__main__":
t.setup() t.setup()
t.test_xex() t.test_xex()
t.test_copy() t.test_copy()
t = TestComments()
t.setup()
t.test_split_data_at_comment()