Find character occurrence percentage from a list of words
Solution 1:
You can use `collections.Counter` for this, and then divide by the total number of characters:
from collections import Counter
word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Flatten the words into one string: only the characters matter from here on,
# and its length is the denominator for every percentage.
chars = "".join(word_list)
counter = Counter(chars)
n_chars = len(chars)

# most_common() yields (character, count) pairs, highest count first.
ranked = counter.most_common()

# Scale each raw count to a percentage of all characters.
occ_pct = [
    (char, occ / n_chars * 100)
    for char, occ in ranked
]
print(occ_pct)
[('E', 13.58936484490399), ('T', 11.669128508124077), ('I', 7.828655834564254), ('A', 7.828655834564254), ('S', 6.794682422451995), ('O', 6.3515509601181686), ('N', 6.20384047267356), ('R', 4.874446085672083), ('L', 4.874446085672083), ('H', 4.579025110782865), ('P', 3.2496307237813884), ('B', 3.10192023633678), ('U', 3.10192023633678), ('Y', 2.511078286558346), ('C', 2.511078286558346), ('D', 2.511078286558346), ('M', 2.363367799113737), ('F', 1.7725258493353029), ('G', 1.6248153618906942), ('X', 0.8862629246676514), ('V', 0.7385524372230428), ('W', 0.5908419497784343), ('K', 0.29542097488921715), ('Z', 0.14771048744460857)]
This can easily be printed out nicely:
# Each entry of occ_pct is a (character, percentage) pair; show the
# percentage with two decimal places.
for char, pct in occ_pct:
    print(char, f"{pct:.2f}%")
E 13.59%
T 11.67%
I 7.83%
A 7.83%
S 6.79%
O 6.35%
N 6.20%
R 4.87%
L 4.87%
H 4.58%
P 3.25%
B 3.10%
U 3.10%
Y 2.51%
C 2.51%
D 2.51%
M 2.36%
F 1.77%
G 1.62%
X 0.89%
V 0.74%
W 0.59%
K 0.30%
Z 0.15%
Or sorted alphabetically by character instead:
# Sort on the character (first element of each pair) so the output is
# alphabetical rather than ordered by frequency.
for char, pct in sorted(occ_pct, key=lambda pair: pair[0]):
    print(char, f"{pct:.2f}%")
A 7.83%
B 3.10%
C 2.51%
D 2.51%
E 13.59%
F 1.77%
G 1.62%
H 4.58%
I 7.83%
K 0.30%
L 4.87%
M 2.36%
N 6.20%
O 6.35%
P 3.25%
R 4.87%
S 6.79%
T 11.67%
U 3.10%
V 0.74%
W 0.59%
X 0.89%
Y 2.51%
Z 0.15%
Edit:
As requested, without `collections`:
# Mapping from character to usage frequency
usage = {}
# The total number of characters
total = 0
for word in word_list:
    for char in word:
        # Add 1 to the value corresponding to the `char` key.
        # (and set to 1 if it doesn't exist yet)
        usage[char] = usage.get(char, 0) + 1
        total += 1

# usage.items() yields (character, occurrences) pairs. Sorting them by
# descending count (stable, so ties keep first-seen order) gives the same
# ordering as Counter.most_common(); `total` then turns each count into a
# percentage.
occ_pct = [
    (char, occ / total * 100)
    for char, occ in sorted(usage.items(), key=lambda item: item[1], reverse=True)
]
Solution 2:
word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Concatenate all words; the word boundaries are irrelevant for counting.
word_str = ''.join(word_list)
# Total number of characters: the denominator for every percentage.
word_length = len(word_str)

# Maps each character to the number of times it occurs in word_str.
character_count = {}

for character in word_str:
    if character in character_count:
        # character has occurred at least once already
        character_count[character] += 1
    else:
        # character occurs for the first time
        character_count[character] = 1

# sorted() on the (character, count) pairs orders the output by character.
for character, count in sorted(character_count.items()):
    print(f'{character}: {round(count / word_length * 100, 2)}%')
This is a fairly straightforward solution. First, I make a single string out of your list of strings to simplify things a bit, since the individual strings do not appear to be necessary for this. `''.join(word_list)` concatenates the strings with an empty string between them.
word_str -> 'THEZENOFPYTHON...'
The total number of characters, `len(word_str)`, is necessary to calculate the percentage occurrence of a character.
There are probably more elegant ways to count the characters, but I find a dictionary (`character_count`) easier to read/explain.
In the for-loop, I check if a counter for a character already exists in the dictionary. If it does, increment the counter. If it doesn't, initiate the counter with the value 1.
Now, the dictionary contains a count of every occurring character:
character_count -> {
'T': 79,
'H': 31,
...
}
Next, to print the values the way you want, we can iterate over the keys and values of the dictionary with `character_count.items()`. However, use `sorted()` to make them appear in lexicographic order.
I calculate the percentage directly in the formatted string. `count / word_length` would be something like 0.018912347, so I multiply by 100 and then use `round( ... , 2)` to only display up to two decimal digits.
Solution 3:
You can use a dictionary to count each character across all words while also keeping a running total of characters. Then go through the whole dictionary and calculate the percentage for each character.
word_list = ['THE', 'ZEN', 'OF', 'PYTHON', 'BY', 'TIM', 'PETERS', 'BEAUTIFUL', 'IS', 'BETTER', 'THAN', 'UGLY', 'EXPLICIT', 'IS', 'BETTER', 'THAN', 'IMPLICIT', 'SIMPLE', 'IS', 'BETTER', 'THAN', 'COMPLEX', 'COMPLEX', 'IS', 'BETTER', 'THAN', 'COMPLICATED', 'FLAT', 'IS', 'BETTER', 'THAN', 'NESTED', 'SPARSE', 'IS', 'BETTER', 'THAN', 'DENSE', 'READABILITY', 'COUNTS', 'SPECIAL', 'CASES', 'ARENT', 'SPECIAL', 'ENOUGH', 'TO', 'BREAK', 'THE', 'RULES', 'ALTHOUGH', 'PRACTICALITY', 'BEATS', 'PURITY', 'ERRORS', 'SHOULD', 'NEVER', 'PASS', 'SILENTLY', 'UNLESS', 'EXPLICITLY', 'SILENCED', 'IN', 'THE', 'FACE', 'OF', 'AMBIGUITY', 'REFUSE', 'THE', 'TEMPTATION', 'TO', 'GUESS', 'THERE', 'SHOULD', 'BE', 'ONE', 'AND', 'PREFERABLY', 'ONLY', 'ONE', 'OBVIOUS', 'WAY', 'TO', 'DO', 'IT', 'ALTHOUGH', 'THAT', 'WAY', 'MAY', 'NOT', 'BE', 'OBVIOUS', 'AT', 'FIRST', 'UNLESS', 'YOURE', 'DUTCH', 'NOW', 'IS', 'BETTER', 'THAN', 'NEVER', 'ALTHOUGH', 'NEVER', 'IS', 'OFTEN', 'BETTER', 'THAN', 'RIGHT', 'NOW', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'HARD', 'TO', 'EXPLAIN', 'ITS', 'A', 'BAD', 'IDEA', 'IF', 'THE', 'IMPLEMENTATION', 'IS', 'EASY', 'TO', 'EXPLAIN', 'IT', 'MAY', 'BE', 'A', 'GOOD', 'IDEA', 'NAMESPACES', 'ARE', 'ONE', 'HONKING', 'GREAT', 'IDEA', '', 'LETS', 'DO', 'MORE', 'OF', 'THOSE']

# Running total of characters across all words (the percentage denominator).
character_count = 0
# Maps each character to its number of occurrences.
char_dict = {}

for word in word_list:
    for character in word:
        character_count += 1
        # get() supplies 0 the first time a character is seen.
        char_dict[character] = char_dict.get(character, 0) + 1

# Convert every raw count into a percentage of all characters, rounded to
# two decimal places.
per_dict = {
    char: round((count / character_count) * 100, 2)
    for char, count in char_dict.items()
}
print(per_dict)