Enclose text in a specific tag [duplicate]
I am looking for a simple way to enclose the text inside an HTML tag in another tag.
Let's assume I have an HTML file saved locally and I need to enclose any text inside any tag in <span class="text">.
Example:
Input html file:
...
<p>
Hello there.
<b>
General Kenobi!
</b>
You are a bold one.
</p>
...
Output html file:
...
<p>
<span class="text">Hello there.</span>
<b>
<span class="text">General Kenobi!</span>
</b>
<span class="text">You are a bold one.</span>
</p>
...
What would be the simplest way to do this in Python?
Solution 1:
I implemented this and tested it briefly, but it still does not handle some cases.
One such case is when there is no gap between adjacent tags, for example <p></p> or </p><p>.
This may not affect your current use case, so give it a try.
My idea is to first get all the tags, then filter to get valid tag pairs, format and replace the corresponding location data.
class Converter:
    """Wrap the text found between paired tags in ``fmt``.

    The source is scanned for tags, opening tags are matched with their
    closing tags, and every non-blank run of text that precedes a paired
    tag is replaced by ``fmt.format(text)`` (surrounding whitespace is left
    outside the wrapper).

    Known limits that follow from the two patterns below:

    * A tag is only recognised when it contains no ``/`` after the optional
      leading one, so ``<br/>`` or attributes holding a slash are treated
      as plain text.
    * The closing tag is derived by inserting ``/`` right after ``<`` of
      the complete opening tag, so ``<p class="x">`` only pairs with a
      closing tag carrying the identical text.
    * Tags that cannot be paired are kept as part of the surrounding text,
      and text after the last paired tag is left untouched.

    The module must provide ``re`` and ``itertools.chain``.
    """

    fmt = '<span class="text">{}</span>'
    # tag matching rule: captures everything between "<" and ">" of an
    # opening tag (closing tags contain "/" and therefore never match)
    tag_regex_rule = re.compile("(?<=<)([^/]*?)(?=>)", re.S)
    # locates every opening or closing tag in the source text
    _tag_pattern = re.compile("</?[^/><]+?>")

    def __init__(self, source_data: str):
        self.source_data = source_data

    def convert(self) -> str:
        """Return the source text with tag-enclosed text wrapped in ``fmt``."""
        tags, tag_index_list = self._get_tag_info()
        return self._replace(tags, tag_index_list)

    def _get_tag_info(self):
        """Extract the tags in the data.

        Returns ``(tags, tag_index_list)``. Both lists start with a sentinel
        entry (``""`` and ``(0, 0)``) standing for the start of the document,
        followed by one entry per tag: its text and its ``(start, end)``
        offsets in the source.
        """
        tags = [""]
        tag_index_list = [(0, 0)]  # [(start_index, end_index), ...]
        # Offsets come straight from the regex matches. Searching for the
        # text between tags instead goes wrong when that text is empty
        # (adjacent tags) or also occurs inside the tag itself.
        for match in self._tag_pattern.finditer(self.source_data):
            tags.append(match.group())
            tag_index_list.append(match.span())
        return tags, tag_index_list

    @staticmethod
    def list_index(_list: list, val, _start: int = 0) -> int:
        """Find the index position of a value from the list, return -1 if it does not exist"""
        try:
            return _list.index(val, _start)
        except ValueError:
            return -1

    def _find_tag_pairs(self, offset_index, tags, left_tag, right_tag):
        """Find the right tag based on the left tag.

        Scans ``tags`` from ``offset_index`` (the position just after the
        opening tag) and returns the index of the ``right_tag`` that
        balances it, counting nested occurrences of ``left_tag``.

        If the tags never balance, the last ``right_tag`` seen is returned
        so that unbalanced markup is still paired on a best-effort basis.
        ``None`` is returned when no ``right_tag`` follows at all.
        """
        depth = 1
        last_right_index = None
        for position in range(offset_index, len(tags)):
            current = tags[position]
            if current == left_tag:
                depth += 1
            elif current == right_tag:
                depth -= 1
                last_right_index = position
                if depth == 0:
                    return position
        return last_right_index

    def _parse(self, tags, tag_pairs_tree, child_index_offset=0):
        """Parse the tag pairs and construct the data format.

        For every paired opening tag in ``tags`` one list is appended to
        ``tag_pairs_tree``: ``[open_index, *indices of nested pairs,
        close_index]``. ``child_index_offset`` is added to each stored index
        so that indices found in a sliced sub-list refer to the full list.
        """
        index = 0
        while index < len(tags):
            tag = tags[index]
            # Only an opening tag changes under the substitution; closing
            # tags and the leading sentinel come back unchanged.
            right_tag = self.tag_regex_rule.sub(r"/\g<1>", tag)
            if right_tag == tag:
                index += 1
                continue
            # Search strictly after the opening tag, using indices local to
            # this (possibly sliced) list.
            r_index = self._find_tag_pairs(index + 1, tags, tag, right_tag)
            if r_index is None:
                index += 1
                continue
            _child_tag_pairs_tree = []
            self._parse(
                tags[index + 1: r_index],
                _child_tag_pairs_tree,
                child_index_offset=index + 1 + child_index_offset,
            )
            tag_pairs_tree.append(
                [index + child_index_offset, *chain(*_child_tag_pairs_tree), r_index + child_index_offset]
            )
            # Continue with the siblings that follow the closing tag.
            index = r_index + 1

    def _replace(self, tags, tag_index_list):
        """Replace data.

        Walks the paired tags in document order and wraps the stripped text
        lying between the end of the previous paired tag and the start of
        the current one. Returns the rebuilt document.
        """
        tag_pairs_tree = []
        self._parse(tags, tag_pairs_tree)
        source_data = self.source_data
        pieces = []
        copied_upto = 0  # everything before this offset is already in pieces
        prev_tag_index = 0
        for tag_index in chain(*tag_pairs_tree):
            offset = tag_index_list[prev_tag_index][1]
            val = source_data[offset: tag_index_list[tag_index][0]]
            content = val.strip()
            if content:
                # Keep the surrounding whitespace outside the wrapper.
                s_index = offset + val.index(content)
                pieces.append(source_data[copied_upto: s_index])
                pieces.append(self.fmt.format(content))
                copied_upto = s_index + len(content)
            prev_tag_index = tag_index
        pieces.append(source_data[copied_upto:])
        return "".join(pieces)
if __name__ == '__main__':
    # Demo input: deliberately irregular markup (an unclosed <a> and a
    # surplus </p>) to show how unpaired tags are handled.
    inp = """
<p>
Hello there. <a>
<p>
<b>
General Kenobi!
</b>
</p>
</p>
You are a bold one.</p>
<p>
</p>
"""
    converted = Converter(inp).convert()
    print(converted)
Output:
<p>
<span class="text">Hello there. <a></span>
<p>
<b>
<span class="text">General Kenobi!</span>
</b>
</p>
</p>
<span class="text">You are a bold one.</p></span>
<p>
</p>