Better way to convert file sizes in Python [closed]
Here is what I use:
import math
def convert_size(size_bytes):
    """Return *size_bytes* as a human-readable string using 1024-based units.

    Parameters
    ----------
    size_bytes: int | float
        Size in bytes; must not be negative.

    Returns
    -------
    str:
        ``"0B"`` for zero, otherwise the value rounded to two decimals
        followed by a space and the unit, e.g. ``"1.5 KB"``.

    Raises
    ------
    ValueError
        If *size_bytes* is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Pick the unit by comparing against exact integer thresholds instead of
    # floor(log(size, 1024)): the logarithm is subject to floating-point error
    # at exact powers, goes negative for values below 1, and runs past the
    # end of the table for values of 1024**9 and above.
    i = 0
    while i < len(size_name) - 1 and size_bytes >= 1024 ** (i + 1):
        i += 1
    s = round(size_bytes / 1024 ** i, 2)
    return "%s %s" % (s, size_name[i])
NB: the size must be passed in bytes.
There is hurry.filesize, which will take the size in bytes and make a nice string out of it.
>>> from hurry.filesize import size
>>> size(11000)
'10K'
>>> size(198283722)
'189M'
Or if you want 1K == 1000 (which is what most users assume):
>>> from hurry.filesize import size, si
>>> size(11000, system=si)
'11K'
>>> size(198283722, system=si)
'198M'
It has IEC support as well (but that wasn't documented):
>>> from hurry.filesize import size, iec
>>> size(11000, system=iec)
'10Ki'
>>> size(198283722, system=iec)
'189Mi'
Because it's written by the Awesome Martijn Faassen, the code is small, clear and extensible. Writing your own systems is dead easy.
Here is one:
# Custom unit system: (threshold, suffix) pairs ordered from the largest
# power of 1024 down to 1024 ** 0.
mysystem = [
    (1024 ** power, label)
    for power, label in zip(
        range(5, -1, -1),
        (' Megamanys', ' Lotses', ' Tons', ' Heaps', ' Bunches', ' Thingies'),
    )
]
Used like so:
>>> from hurry.filesize import size
>>> size(11000, system=mysystem)
'10 Bunches'
>>> size(198283722, system=mysystem)
'189 Heaps'
Instead of a size divisor of 1024 * 1024 you could use the << bitwise shift operator, i.e. 1<<20 to get megabytes, 1<<30 to get gigabytes, etc.
In the simplest scenario you can have e.g. a constant MBFACTOR = float(1<<20) which can then be used with bytes, i.e.: megas = size_in_bytes/MBFACTOR.
Megabytes are usually all that you need, or otherwise something like this can be used:
# bytes pretty-printing
# (factor, suffix) pairs ordered from the largest factor to the smallest.
# A suffix may be a (singular, plural) tuple.
UNITS_MAPPING = [
    (1<<50, ' PB'),
    (1<<40, ' TB'),
    (1<<30, ' GB'),
    (1<<20, ' MB'),
    (1<<10, ' KB'),
    (1, (' byte', ' bytes')),
]


def pretty_size(bytes, units=UNITS_MAPPING):
    """Get human-readable file sizes.
    simplified version of https://pypi.python.org/pypi/hurry.filesize/

    Parameters
    ----------
    bytes: int | float
        Size in bytes. (The name shadows the builtin but is kept so that
        existing keyword callers keep working.)
    units: iterable of (factor, suffix)
        Unit table ordered from the largest factor to the smallest. The
        first entry whose factor is <= ``bytes`` is used; if none matches,
        the last entry is used. A suffix is either a string or a
        ``(singular, plural)`` tuple.

    Returns
    -------
    str:
        The whole number of units followed by the suffix, e.g. ``"4 KB"``.

    Raises
    ------
    ValueError
        If ``units`` is empty.
    """
    selected = None
    for selected in units:
        if bytes >= selected[0]:
            break
    if selected is None:
        raise ValueError("units must contain at least one (factor, suffix) pair")
    factor, suffix = selected

    if isinstance(bytes, int) and isinstance(factor, int):
        # Exact integer division: going through float rounds large values
        # up (e.g. 2**60 - 1 would be reported as 1024 PB instead of 1023).
        amount = bytes // factor
    else:
        amount = int(bytes / factor)

    if isinstance(suffix, tuple):
        singular, multiple = suffix
        suffix = singular if amount == 1 else multiple
    return str(amount) + suffix


print(pretty_size(1))
print(pretty_size(42))
print(pretty_size(4096))
print(pretty_size(238048577))
print(pretty_size(334073741824))
print(pretty_size(96995116277763))
print(pretty_size(3125899904842624))
## [Out] ###########################
1 byte
42 bytes
4 KB
227 MB
311 GB
88 TB
2 PB
Here are some easy-to-copy one-liners to use if you already know what unit size you want. If you're looking for a more generic function with a few nice options, see my FEB 2021 update further on...
Bytes
print(f"{os.path.getsize(filepath):,} B")
Kilobits
print(f"{os.path.getsize(filepath)/float(1<<7):,.0f} kb")
Kilobytes
print(f"{os.path.getsize(filepath)/float(1<<10):,.0f} KB")
Megabits
print(f"{os.path.getsize(filepath)/float(1<<17):,.0f} mb")
Megabytes
print(f"{os.path.getsize(filepath)/float(1<<20):,.0f} MB")
Gigabits
print(f"{os.path.getsize(filepath)/float(1<<27):,.0f} gb")
Gigabytes
print(f"{os.path.getsize(filepath)/float(1<<30):,.0f} GB")
Terabytes
print(f"{os.path.getsize(filepath)/float(1<<40):,.0f} TB")
UPDATE FEB 2021 Here are my updated and fleshed-out functions to a) get file/folder size, b) convert into desired units:
from pathlib import Path
def get_path_size(path = Path('.'), recursive=False):
    """
    Gets file size, or total size of the files in a directory

    Parameters
    ----------
    path: str | pathlib.Path
        File path or directory/folder path
    recursive: bool
        True -> use .rglob i.e. include files in nested directories
        False -> use .glob i.e. only process current directory/folder

    Returns
    -------
    int:
        File size or (optionally recursive) directory size in bytes
        Use cleverutils.format_bytes to convert to other units e.g. MB

    Raises
    ------
    FileNotFoundError
        If path does not exist (raised by the underlying stat call).
    """
    path = Path(path)
    if path.is_dir():
        # '*' rather than '*.*': the latter silently skips every file whose
        # name has no dot (e.g. "Makefile"). Only regular files are summed,
        # so directory entries do not inflate the total.
        entries = path.rglob('*') if recursive else path.glob('*')
        return sum(entry.stat().st_size for entry in entries if entry.is_file())
    # Anything that is not a directory: stat it directly. A missing path
    # raises FileNotFoundError here instead of failing on an unbound local.
    return path.stat().st_size
def format_bytes(bytes, unit, SI=False):
    """
    Converts bytes to common units such as kb, kib, KB, mb, mib, MB

    Parameters
    ---------
    bytes: int
        Number of bytes to be converted
    unit: str
        Desired unit of measure for output. The first letter is
        case-insensitive and one trailing plural "s" is ignored; the case of
        the remaining letters matters ("mb"/"Mb" is megabits, "MB" is
        megabytes). "b", "bit" and "bits" (any case except a lone capital
        "B") give bits; "B", "byte" and "bytes" give plain bytes.
    SI: bool
        True -> Use SI standard e.g. KB = 1000 bytes
        False -> Use JEDEC standard e.g. KB = 1024 bytes

    Returns
    -------
    str:
        E.g. "7 MiB" where MiB is the original unit abbreviation supplied

    Raises
    ------
    IndexError
        If unit is empty or not one of the supported names. (IndexError is
        kept, rather than ValueError, for backward compatibility.)
    """
    # A lone capital "B" means bytes; it must be checked before the
    # case-insensitive bits test below, which would otherwise claim it.
    if unit == "B" or unit.lower() in ("byte", "bytes"):
        return f"{bytes} {unit}"
    if unit.lower() in ("b", "bit", "bits"):
        return f"{bytes*8} {unit}"

    # Each key lists the accepted spellings; the value is
    # (binary shift for the JEDEC divisor, power of 1000 for the SI divisor).
    reference = {"Kb Kib Kibibit Kilobit": (7, 1),
                 "KB KiB Kibibyte Kilobyte": (10, 1),
                 "Mb Mib Mebibit Megabit": (17, 2),
                 "MB MiB Mebibyte Megabyte": (20, 2),
                 "Gb Gib Gibibit Gigabit": (27, 3),
                 "GB GiB Gibibyte Gigabyte": (30, 3),
                 "Tb Tib Tebibit Terabit": (37, 4),
                 "TB TiB Tebibyte Terabyte": (40, 4),
                 "Pb Pib Pebibit Petabit": (47, 5),
                 "PB PiB Pebibyte Petabyte": (50, 5),
                 "Eb Eib Exbibit Exabit": (57, 6),
                 "EB EiB Exbibyte Exabyte": (60, 6),
                 "Zb Zib Zebibit Zettabit": (67, 7),
                 "ZB ZiB Zebibyte Zettabyte": (70, 7),
                 "Yb Yib Yobibit Yottabit": (77, 8),
                 "YB YiB Yobibyte Yottabyte": (80, 8),
                 }

    # Normalise: capitalise the first letter and drop one trailing plural "s".
    tail = unit[1:]
    if tail.endswith("s"):
        tail = tail[:-1]
    unitN = unit[:1].upper() + tail

    # Match whole names only. Substring matching would accept fragments
    # such as "k" or "kilo" and silently treat them as kilobits.
    match = next(
        ((names, spec) for names, spec in reference.items()
         if unitN in names.split()),
        None,
    )
    if match is None:
        key_list = '\n'.join([" b Bit"] + list(reference)) + "\n"
        raise IndexError(f"\n\nConversion unit must be one of:\n\n{key_list}")
    units, divisors = match

    if SI:
        # A bit unit is 1/8 of the corresponding byte unit.
        divisor = 1000**divisors[1]/8 if "bit" in units else 1000**divisors[1]
    else:
        divisor = float(1 << divisors[0])
    value = bytes / divisor

    # Decide the plural from the number actually displayed, so 1.2 is shown
    # as "1 Kilobyte", not "1 Kilobytes". Abbreviations never take an "s".
    shown = f"{value:,.0f}"
    plural = "s" if shown != "1" and len(unitN) > 3 else ""
    return f"{shown} {unitN}{plural}"
# Tests
>>> assert format_bytes(1,"b") == '8 b'
>>> assert format_bytes(1,"bits") == '8 bits'
>>> assert format_bytes(1024, "kilobyte") == "1 Kilobyte"
>>> assert format_bytes(1024, "kB") == "1 KB"
>>> assert format_bytes(7141000, "mb") == '54 Mb'
>>> assert format_bytes(7141000, "mib") == '54 Mib'
>>> assert format_bytes(7141000, "Mb") == '54 Mb'
>>> assert format_bytes(7141000, "MB") == '7 MB'
>>> assert format_bytes(7141000, "mebibytes") == '7 Mebibytes'
>>> assert format_bytes(7141000, "gb") == '0 Gb'
>>> assert format_bytes(1000000, "kB") == '977 KB'
>>> assert format_bytes(1000000, "kB", SI=True) == '1,000 KB'
>>> assert format_bytes(1000000, "kb") == '7,812 Kb'
>>> assert format_bytes(1000000, "kb", SI=True) == '8,000 Kb'
>>> assert format_bytes(125000, "kb") == '977 Kb'
>>> assert format_bytes(125000, "kb", SI=True) == '1,000 Kb'
>>> assert format_bytes(125*1024, "kb") == '1,000 Kb'
>>> assert format_bytes(125*1024, "kb", SI=True) == '1,024 Kb'
Here is the compact function to calculate size
def GetHumanReadable(size,precision=2):
    """Format a byte count using 1024-based units from B up to TB.

    Parameters
    ----------
    size: int | float
        Size in bytes.
    precision: int
        Number of decimal places in the output.

    Returns
    -------
    str:
        The scaled value immediately followed by its suffix, e.g. "1.50KB".
        Values beyond the largest suffix stay in TB.
    """
    suffixes=['B','KB','MB','GB','TB']
    suffixIndex = 0
    lastIndex = len(suffixes) - 1
    # ">=" (not ">") so that exactly 1024 bytes is shown as 1KB rather than
    # 1024B; the bound comes from the suffix list instead of a magic 4.
    while size >= 1024 and suffixIndex < lastIndex:
        suffixIndex += 1 #increment the index of the suffix
        size = size/1024.0 #apply the division
    return "%.*f%s"%(precision,size,suffixes[suffixIndex])
For more detailed output and the reverse operation (human-readable string to bytes), please refer to: http://code.activestate.com/recipes/578019-bytes-to-human-human-to-bytes-converter/