Enclose text in a specific tag [duplicate]

I am looking for a simple way to enclose the text inside an HTML tag in another tag.

Let's assume I have an HTML file saved locally and I need to enclose any text inside any tag in <span class="text">.

Example:

Input html file:

...
  <p>
    Hello there.
    <b>
      General Kenobi!
    </b>
    You are a bold one.
  </p>
...

Output html file:

...
  <p>
    <span class="text">Hello there.</span>
    <b>
      <span class="text">General Kenobi!</span>
    </b>
    <span class="text">You are a bold one.</span>
  </p>
...

What would be the simplest way to do this in Python?


Solution 1:

I think I have implemented it and tested it a little, but it still does not handle some cases.

For example, it fails when there is no gap between tags: <p></p>, </p><p>

Perhaps this does not affect your current use case. You can try it.

My idea is to first get all the tags, then filter them to get valid tag pairs, and finally format and replace the text at the corresponding positions.

# The class below needs these two names; they are imported here so the
# snippet runs on its own.
import re
from itertools import chain


class Converter:
    """Wrap the text found between HTML tags in ``<span class="text">``.

    The source is scanned with regular expressions (no real HTML parser):
    every tag is located, opening tags are paired with a closing tag, and each
    non-blank run of text lying between two consecutive paired tags is wrapped
    with ``fmt``.  Leading and trailing whitespace of a run stays outside the
    wrapper.

    Known limits, all following from the regex approach:

    * Tags containing ``/`` (for example ``<br/>`` or an attribute holding a
      URL) are not recognised as tags.
    * The closing tag is derived by inserting ``/`` right after ``<``, so an
      opening tag that carries attributes will not match a plain ``</name>``.
    * Unpaired tags are treated as ordinary text and may end up inside a
      wrapper.
    * Text after the last paired tag is left untouched.
    """

    fmt = '<span class="text">{}</span>'
    # Rewrites an opening tag into its closing form: captures the text between
    # "<" and ">" as long as it contains no "/" (so closing tags never match).
    tag_regex_rule = re.compile("(?<=<)([^/]*?)(?=>)", re.S)
    # Matches a single opening or closing tag; used to locate every tag.
    tag_split_rule = re.compile("</?[^/><]+?>")

    def __init__(self, source_data: str):
        """Store the HTML text to be converted."""
        self.source_data = source_data

    def convert(self) -> str:
        """Return the source with text runs between paired tags wrapped."""
        tags, tag_index_list = self._get_tag_info()
        return self._replace(tags, tag_index_list)

    def _get_tag_info(self):
        """Locate every tag in the source.

        Returns:
            ``(tags, tag_index_list)`` where ``tags[i]`` is the tag text and
            ``tag_index_list[i]`` its ``(start, end)`` span in the source.
            Entry 0 is a placeholder (``""`` at ``(0, 0)``) standing for the
            beginning of the document, so text before the first tag has a
            left boundary.
        """
        tags = [""]
        tag_index_list = [(0, 0)]  # [(start_index, end_index), ...]
        # Take positions straight from the regex matches.  Searching for the
        # text chunks between tags instead breaks when a chunk is empty
        # (adjacent tags, or a source ending in a tag) or when the chunk also
        # occurs inside the preceding tag (e.g. "<b>b</b>").
        for match in self.tag_split_rule.finditer(self.source_data):
            tags.append(match.group())
            tag_index_list.append(match.span())
        return tags, tag_index_list

    @staticmethod
    def list_index(_list: list, val, _start: int = 0) -> int:
        """Return the index of ``val`` in ``_list`` at or after ``_start``, or -1."""
        try:
            return _list.index(val, _start)
        except ValueError:
            return -1

    def _find_tag_pairs(self, offset_index, tags, left_tag, right_tag):
        """Find the closing tag that pairs with an opening tag.

        Args:
            offset_index: Index in ``tags`` at which the search starts.
            tags: List of tag strings being searched.
            left_tag: The opening tag text.
            right_tag: The closing tag text derived from ``left_tag``.

        Returns:
            Index in ``tags`` of the chosen closing tag, or ``None`` when no
            closing tag exists at or after ``offset_index``.
        """
        if (r_index := self.list_index(tags, right_tag, _start=offset_index)) == -1:
            return None

        # Each identical opening tag nested before the first closing tag
        # pushes the answer one closing tag further to the right.
        nested_count = tags[offset_index:r_index].count(left_tag)
        for _ in range(nested_count):
            offset_index = r_index + 1
            # Break when there is no further right tag
            if (next_r_index := self.list_index(tags, right_tag, _start=offset_index)) == -1:
                break
            # Break when another left tag sits before that right tag
            if tags[offset_index:next_r_index].count(left_tag):
                break
            r_index = next_r_index
        return r_index

    def _parse(self, tags, tag_pairs_tree, child_index_offset=0):
        """Collect paired tags into ``tag_pairs_tree`` (modified in place).

        Each appended entry is a flat list
        ``[open_index, *indices_of_nested_pairs, close_index]``.  Indices are
        positions in the full tag list: ``child_index_offset`` is added to the
        positions found in ``tags``, which is a slice of the full list during
        recursion.
        """
        index = 0
        max_index = len(tags)
        while index < max_index:
            tag = tags[index]
            # Substitution leaves closing tags and non-tags unchanged; those
            # cannot start a pair.
            right_tag = self.tag_regex_rule.sub(r"/\g<1>", tag)
            if right_tag == tag:
                index += 1
                continue

            # Always search right after the opening tag, using a position
            # local to `tags`.  (Seeding from the last recorded pair mixes
            # absolute indices into a sliced list and can start the search
            # before the opening tag.)
            r_index = self._find_tag_pairs(index + 1, tags, tag, right_tag)
            if r_index is None:
                index += 1
                continue

            child_pairs = []
            self._parse(
                tags[index + 1: r_index],
                child_pairs,
                child_index_offset=index + 1 + child_index_offset,
            )
            tag_pairs_tree.append(
                [index + child_index_offset, *chain(*child_pairs), r_index + child_index_offset]
            )
            index = r_index + 1

    def _replace(self, tags, tag_index_list):
        """Build the output string with text runs wrapped by ``fmt``.

        Walks the paired tags in document order; the text between the end of
        the previously visited tag (initially the placeholder at position 0)
        and the start of the current one is wrapped when it is not blank.
        """
        tag_pairs_tree = []
        self._parse(tags, tag_pairs_tree)
        source_data = self.source_data

        pieces = []
        copied_upto = 0  # source position up to which output has been emitted
        prev_tag_index = 0
        for tag_index in chain(*tag_pairs_tree):
            offset = tag_index_list[prev_tag_index][1]
            val = source_data[offset: tag_index_list[tag_index][0]]
            prev_tag_index = tag_index

            content = val.strip()
            if not content:
                continue
            # Keep surrounding whitespace outside the wrapper.
            s_index = offset + val.index(content)
            pieces.append(source_data[copied_upto: s_index])
            pieces.append(self.fmt.format(content))
            copied_upto = s_index + len(content)

        pieces.append(source_data[copied_upto:])
        return "".join(pieces)


if __name__ == '__main__':
    # Deliberately untidy sample: the <a> tag is never closed and there is
    # one more </p> than there are <p> tags before it.
    sample_html = """
      <p>
        Hello there. <a>
        <p>
            <b>
              General Kenobi!
            </b>
        </p>
      </p>
        You are a bold one.</p>
      <p>
      </p>
    """
    converter = Converter(sample_html)
    print(converter.convert())
Output:
<p>
    <span class="text">Hello there. <a></span>
    <p>
        <b>
          <span class="text">General Kenobi!</span>
        </b>
    </p>
</p>
    <span class="text">You are a bold one.</p></span>
<p>
</p>