Find character occurrence percentage from a list of words

Solution 1:

You can use collections' Counter for this, and then divide by the total number of characters:

from collections import Counter

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Flatten the word list into one long string of characters
chars = "".join(word_list)
# Tally how many times each distinct character appears
counter = Counter(chars)
# Denominator for the percentages: every character of every word
n_chars = len(chars)
# Walk the tallies from most to least frequent, turning each one into
# a (character, percentage) pair
occ_pct = []
for char, occ in counter.most_common():
    occ_pct.append((char, occ / n_chars * 100))
print(occ_pct)
[('E', 13.58936484490399), ('T', 11.669128508124077), ('I', 7.828655834564254), ('A', 7.828655834564254), ('S', 6.794682422451995), ('O', 6.3515509601181686), ('N', 6.20384047267356), ('R', 4.874446085672083), ('L', 4.874446085672083), ('H', 4.579025110782865), ('P', 3.2496307237813884), ('B', 3.10192023633678), ('U', 3.10192023633678), ('Y', 2.511078286558346), ('C', 2.511078286558346), ('D', 2.511078286558346), ('M', 2.363367799113737), ('F', 1.7725258493353029), ('G', 1.6248153618906942), ('X', 0.8862629246676514), ('V', 0.7385524372230428), ('W', 0.5908419497784343), ('K', 0.29542097488921715), ('Z', 0.14771048744460857)]

This can easily be printed out nicely:

# Each entry is a (character, percentage) pair; show two decimal places.
for char, pct in occ_pct:
    print(f"{char} {pct:.2f}%")
E 13.59%
T 11.67%
I 7.83%
A 7.83%
S 6.79%
O 6.35%
N 6.20%
R 4.87%
L 4.87%
H 4.58%
P 3.25%
B 3.10%
U 3.10%
Y 2.51%
C 2.51%
D 2.51%
M 2.36%
F 1.77%
G 1.62%
X 0.89%
V 0.74%
W 0.59%
K 0.30%
Z 0.15%

Or sorted alphabetically by character instead:

# Order the (character, percentage) pairs by character before printing.
alphabetical = sorted(occ_pct, key=lambda pair: pair[0])
for char, pct in alphabetical:
    print(f"{char} {pct:.2f}%")
A 7.83%
B 3.10%
C 2.51%
D 2.51%
E 13.59%
F 1.77%
G 1.62%
H 4.58%
I 7.83%
K 0.30%
L 4.87%
M 2.36%
N 6.20%
O 6.35%
P 3.25%
R 4.87%
S 6.79%
T 11.67%
U 3.10%
V 0.74%
W 0.59%
X 0.89%
Y 2.51%
Z 0.15%

Edit: As requested, without collections:

# Running tally: character -> number of times it has been seen
usage = {}
# How many characters have been seen across all words
total = 0
for word in word_list:
    for char in word:
        if char in usage:
            usage[char] += 1
        else:
            # First sighting of this character
            usage[char] = 1
    # Every character of the word was tallied above, so the running total
    # grows by the word's length.
    total += len(word)
# usage.items() yields (character, occurrences) pairs; dividing each count
# by `total` and scaling by 100 gives that character's percentage.
occ_pct = [(char, occ / total * 100) for char, occ in usage.items()]

Solution 2:

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Glue every word together so the characters can be scanned in one pass.
word_str = ''.join(word_list)
# Denominator for the percentage calculation below.
word_length = len(word_str)

character_count = {}
for character in word_str:
    if character not in character_count:
        # first sighting: start this character's counter at 1
        character_count[character] = 1
    else:
        # seen before: bump the existing counter
        character_count[character] += 1

# The (character, count) tuples sort by their first element, so the output
# is ordered by character.
for character, count in sorted(character_count.items()):
    print(f'{character}: {round(count / word_length * 100, 2)}%')

This is a fairly straightforward solution. First, I make a single string out of your list of strings to simplify things a bit, since the individual strings are not needed for this. ''.join(word_list) concatenates the strings with an empty string between them.

word_str -> 'THEZENOFPYTHON...'

The total number of characters, len(word_str), is needed to calculate the percentage occurrence of each character.

There are probably more elegant ways to count the characters, but I find a dictionary (character_count) easier to read/explain.

In the for-loop, I check if a counter for a character already exists in the dictionary. If it does, increment the counter. If it doesn't, initiate the counter with the value 1.

Now, the dictionary contains a count of every occurring character:

character_count -> {
    'T': 79,
    'H': 31,
    ...
}

Next, to print the values the way you want, we can iterate over the keys and values of the dictionary with character_count.items(). However, use sorted() to make them appear in lexicographic order.

I calculate the percentage directly in the formatted string. count / word_length would be something like 0.018912347, so I multiply by 100 and then use round( ... , 2) to only display up to two decimal digits.

Solution 3:

You can use a dictionary to count how often each character occurs across all the words, while also keeping a running total of all characters. Then go through the whole dictionary and calculate each character's percentage.

word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Tally how often each character occurs and how many characters there are
# in total, then turn each tally into a percentage rounded to two decimals.
character_count = 0  # total number of characters across all words
char_dict = {}       # character -> number of occurrences

for word in word_list:
    for character in word:
        character_count += 1
        # Start from 0 the first time a character is seen, then add one.
        char_dict[character] = char_dict.get(character, 0) + 1

# character -> share of all characters, as a percentage with two decimals
per_dict = {
    character: round((occurrences / character_count) * 100, 2)
    for character, occurrences in char_dict.items()
}

print(per_dict)