Added splitting-on-comments when using get_entire_style_ranges, refs: https://github.com/robmcmullen/omnivore/issues/117

This commit is contained in:
Rob McMullen 2017-03-12 14:16:08 -07:00
parent 5f9acaa802
commit 476f0cd568

View File

@ -464,7 +464,7 @@ class DefaultSegment(object):
matches = (self.style & style_bits) == style_bits matches = (self.style & style_bits) == style_bits
return self.bool_to_ranges(matches) return self.bool_to_ranges(matches)
def get_entire_style_ranges(self, **kwargs): def get_entire_style_ranges(self, split_comments=None, **kwargs):
"""Find sections of the segment that have the same style value. """Find sections of the segment that have the same style value.
The arguments to this function are used as a mask for the style to The arguments to this function are used as a mask for the style to
@ -477,6 +477,10 @@ class DefaultSegment(object):
""" """
style_bits = self.get_style_bits(**kwargs) style_bits = self.get_style_bits(**kwargs)
matches = self.style & style_bits matches = self.style & style_bits
if split_comments is None:
split_comments = set()
else:
split_comments = set(split_comments)
groups = np.split(matches, np.where(np.diff(matches) != 0)[0] + 1) groups = np.split(matches, np.where(np.diff(matches) != 0)[0] + 1)
# split into groups with the same numbers # split into groups with the same numbers
ranges = [] ranges = []
@ -486,7 +490,18 @@ class DefaultSegment(object):
return return
for group in groups: for group in groups:
next_end = last_end + len(group) next_end = last_end + len(group)
ranges.append(((last_end, next_end), matches[last_end])) style = matches[last_end]
if style in split_comments:
comment_list = self.get_comments_in_range(last_end, next_end)
for index in sorted(comment_list.keys()):
if last_end == index:
# skip if the comment is at the start point because it
# will always be split at the start point
continue
ranges.append(((last_end, index), style))
last_end = index
if last_end < next_end:
ranges.append(((last_end, next_end), style))
last_end = next_end last_end = next_end
return ranges return ranges
@ -655,6 +670,21 @@ class DefaultSegment(object):
comments.append(comment) comments.append(comment)
return has_comments, comments return has_comments, comments
def get_comments_in_range(self, start, end):
    """Collect the comments whose index lies in the half-open range
    ``[start, end)``.

    Each key of ``self.rawdata.extra.comments`` is translated with
    ``self.get_index_from_base_index``; keys for which that translation
    raises ``IndexError`` are ignored.

    Returns a dict mapping the translated index to its comment.
    """
    found = {}
    # Simple linear scan over every stored comment, keeping only those
    # that translate to an index inside the requested bounds.
    for base_index, text in self.rawdata.extra.comments.iteritems():
        try:
            local_index = self.get_index_from_base_index(base_index)
        except IndexError:
            # This key has no usable index according to the translation
            # helper, so there is nothing to report for it.
            continue
        if local_index < start or local_index >= end:
            continue
        found[local_index] = text
    return found
def get_nonblank_comments_at_indexes(self, indexes): def get_nonblank_comments_at_indexes(self, indexes):
"""Get a list of comments at specified indexes, but if blank, search """Get a list of comments at specified indexes, but if blank, search
backward in that comment block to find the first index which should backward in that comment block to find the first index which should