Convert unstructured data into a Python Dictionary

Solution 1:

Well, that was a bit more complicated than anticipated - but this solution does what you say you need, although it's a bit different from what you started out with:

from typing import Any, List, TextIO, Optional, Tuple
from io import StringIO

# Sample input wrapped in a StringIO so it can be read line by line like an open text file.
# Unindented "main ..." lines are each followed by more deeply indented lines; the amount of
# leading whitespace is what the parser below uses to decide how lines nest.
sample = StringIO("""main sub_main sub_main_1
 AAA A-ABC ABC
 AAA A-DEF A-DEF-GHI GHI
main sub_main sub_main_2
 BBB B-ABC ABC
 BBB B-DEF DEF
 BBB B-X B-Y B-Z ""
main sub_main sub_main_3
 CCC C-ABC  ABC
 CCC C-X C-Y C-Z ""
 CCC C-PQR C-STU 2
  C-LMN C-OPQ C-RST ""
 CCC C-DEF C-DEF-GHI ""
 CCC C-DEF C-DEF-JKL C-MNO 1
  C-XYZ ""
main sub_main sub_main_4
 DDD D-ABC  DEF
 DDD D-PQR  STU
main sub_main sub_main_5
 EEE E-ABC DEF
 EEE E-PQR STU
main sub_main sub_main_6
 FFF F-ABC  F-DEF
 FFF F-PQR  F-STU""")


def _dig(d: dict, keys: List[str], value: Any):
    """
    returns a copy of d, recursively updated with value using nested list of string keys
    """
    return d | {
        keys[0]: (
            _dig({}, keys[1:], value) if keys[0] not in d else _dig(d[keys[0]], keys[1:], value)
        ) if len(keys) > 1 else (value if value != '""' else '')}


def _data_to_dict(fp: TextIO, next_line: Optional[Tuple[int, str]], process_line: Optional[Tuple[int, str]], level: int):
    result = {}
    while True:
        # if there's no line to process, process next_line and load a new next_line
        if process_line is None:
            process_line = next_line
            try:
                line = next(fp)
                next_line = len(line) - len(line.lstrip()), [key for key in line.strip().split() if key]
            except StopIteration:
                # if no next_line could be read, done if process_line is None as well
                if process_line is None:
                    return next_line, result
                # otherwise, continue with next_line = None
                next_line = None
        else:
            # if the line to process is at the same or deeper level as the next line
            if next_line is None or process_line[0] >= next_line[0]:
                result = _dig(result, process_line[1][:-1], process_line[1][-1])
                if next_line is None or process_line[0] > next_line[0]:
                    return next_line, result
            else:  # prev_line[0] < line[0]
                next_line, sub = _data_to_dict(fp, next_line, None, level + 1)
                result = _dig(result, process_line[1][:-2] + [f'{process_line[1][-2]} {process_line[1][-1]}'], sub)
                if next_line is not None and next_line[0] < level:
                    return next_line, result
            process_line = None


def data_to_dict(fp: TextIO):
    """
    returns the nested dict parsed from the indented lines of fp

    Starts the recursive parser at depth 0 with nothing read yet and discards the read-ahead
    line it hands back, keeping only the parsed dict.
    """
    parsed = _data_to_dict(fp, None, None, 0)
    return parsed[1]


# demo: parse the in-memory sample and print the resulting dict; the parser only reads its
# input line by line, so an open text file would work in place of the StringIO as well
print(data_to_dict(sample))

It doesn't pretty-print the dictionary, but you'll find it matches the structure you require.

In versions of Python before 3.9, replace _dig with the version below, because the | operator for dictionaries was only added in 3.9.0:

def _dig(d: dict, keys: List[str], value: Any):
    """
    returns a copy of d, recursively updated with value using nested list of string keys
    """
    return {**d, **{
        keys[0]: (
            _dig({}, keys[1:], value) if keys[0] not in d else _dig(d[keys[0]], keys[1:], value)
        ) if len(keys) > 1 else (value if value != '""' else '')}}

I tested the same code with this updated _dig on 3.6 and that works. If you're using even older versions of Python, I strongly suggest updating (or being very clear in your questions that you're using a very outdated version of Python).