Added splitting-on-comments when using get_entire_style_ranges, refs: https://github.com/robmcmullen/omnivore/issues/117

2025-01-16 02:32:52 +00:00 · 2017-03-12 14:16:08 -07:00 · 2017-03-12 14:16:08 -07:00 · 476f0cd568
commit 476f0cd568
parent 5f9acaa802
1 changed files with 32 additions and 2 deletions
--- a/atrcopy/segments.py
+++ b/atrcopy/segments.py
@ -464,7 +464,7 @@ class DefaultSegment(object):
        matches = (self.style & style_bits) == style_bits
        return self.bool_to_ranges(matches)
-    def get_entire_style_ranges(self, **kwargs):
+    def get_entire_style_ranges(self, split_comments=None, **kwargs):
        """Find sections of the segment that have the same style value.
        The arguments to this function are used as a mask for the style to
@ -477,6 +477,10 @@ class DefaultSegment(object):
        """
        style_bits = self.get_style_bits(**kwargs)
        matches = self.style & style_bits
        if split_comments is None:
            split_comments = set()
        else:
            split_comments = set(split_comments)
        groups = np.split(matches, np.where(np.diff(matches) != 0)[0] + 1)
        # split into groups with the same numbers
        ranges = []
@ -486,7 +490,18 @@ class DefaultSegment(object):
            return
        for group in groups:
            next_end = last_end + len(group)
-            ranges.append(((last_end, next_end), matches[last_end]))
+            style = matches[last_end]
            if style in split_comments:
                comment_list = self.get_comments_in_range(last_end, next_end)
                for index in sorted(comment_list.keys()):
                    if last_end == index:
                        # skip if the comment is at the start point because it
                        # will always be split at the start point
                        continue
                    ranges.append(((last_end, index), style))
                    last_end = index
            if last_end < next_end:
                ranges.append(((last_end, next_end), style))
            last_end = next_end
        return ranges
@ -655,6 +670,21 @@ class DefaultSegment(object):
            comments.append(comment)
        return has_comments, comments
    def get_comments_in_range(self, start, end):
        """Collect the comments whose segment index lies in [start, end).

        Every entry of ``self.rawdata.extra.comments`` is keyed by a raw
        (base) index. Each key is translated with
        ``get_index_from_base_index`` and kept only when the translated
        index satisfies ``start <= index < end``.

        Returns a dict (not a list) mapping each translated index to its
        comment; the dict is empty when no comment falls in the range.
        """
        comments = {}
        # Naive way, but maybe it's fast enough: loop over all comments
        # gathering those within the bounds.  items() is used instead of the
        # Python-2-only iteritems() so the scan runs on Python 2 and 3 alike.
        for rawindex, comment in self.rawdata.extra.comments.items():
            try:
                index = self.get_index_from_base_index(rawindex)
            except IndexError:
                # The raw index could not be translated (presumably the
                # comment lies outside this segment), so ignore it.
                continue
            if start <= index < end:
                comments[index] = comment
        return comments
    def get_nonblank_comments_at_indexes(self, indexes):
        """Get a list of comments at specified indexes, but if blank, search
        backward in that comment block to find the first index which should