Facebook JSON badly encoded
Solution 1:
I can indeed confirm that the Facebook download data is incorrectly encoded; it is Mojibake. The original data is UTF-8 encoded but was decoded as Latin-1 instead. I’ll make sure to file a bug report.
In the meantime, you can repair the damage in two ways:
-
Decode the data as JSON, then re-encode any strings as Latin-1, decode again as UTF-8:
>>> import json
>>> data = r'"Rados\u00c5\u0082aw"'
>>> json.loads(data).encode('latin1').decode('utf8')
'Radosław'
-
Load the data as binary, replace all \u00hh sequences with the byte the last two hex digits represent, decode as UTF-8 and then decode as JSON:
import re
from functools import partial

fix_mojibake_escapes = partial(
    re.compile(rb'\\u00([\da-f]{2})').sub,
    lambda m: bytes.fromhex(m.group(1).decode()))

with open(os.path.join(subdir, file), 'rb') as binary_data:
    repaired = fix_mojibake_escapes(binary_data.read())
data = json.loads(repaired.decode('utf8'))
From your sample data this produces:
{'content': 'No to trzeba ostatnie treningi zrobić xD', 'sender_name': 'Radosław', 'timestamp': 1524558089, 'type': 'Generic'}
Solution 2:
Here is a command-line solution with jq and iconv. Tested on Linux.
cat message_1.json | jq . | iconv -f utf8 -t latin1 > m1.json
Solution 3:
My solution for parsing objects uses the object_hook callback of the json.load/json.loads functions:
import json
def parse_obj(dct):
    """Repair UTF-8-read-as-Latin-1 mojibake in the values of a JSON object.

    Written to be passed as ``object_hook`` to ``json.load``/``json.loads``.
    The hook is invoked for every decoded object, innermost first, so
    nested dicts have already been repaired by the time their parent is
    seen and are deliberately left untouched here.

    Args:
        dct: A dict produced by the JSON decoder. It is modified in place.

    Returns:
        The same dict, with every string value (including strings inside
        list values) re-encoded as Latin-1 and decoded as UTF-8.

    Raises:
        UnicodeEncodeError: A string contains a character outside Latin-1.
        UnicodeDecodeError: The recovered bytes are not valid UTF-8.
    """
    def _repair(value):
        if isinstance(value, str):
            return value.encode('latin_1').decode('utf-8')
        if isinstance(value, list):
            return [_repair(item) for item in value]
        # Numbers, booleans, None and (already repaired) dicts pass through.
        return value

    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    for key, value in dct.items():
        dct[key] = _repair(value)
    return dct
# Sample payload. The raw string keeps the JSON \u00XX escapes intact, so it
# is the json module (not the Python string literal) that decodes them.
data = r'{"msg": "Ahoj sv\u00c4\u009bte"}'

# String
json.loads(data)
# Out: {'msg': 'Ahoj svÄ\x9bte'}
json.loads(data, object_hook=parse_obj)
# Out: {'msg': 'Ahoj světe'}

# File
with open('/path/to/file.json', encoding='utf-8') as f:
    json.load(f, object_hook=parse_obj)
    # Out: {'msg': 'Ahoj světe'}
Update:
The solution above does not work for lists of strings, so here is an updated solution:
import json
def parse_obj(obj):
    """Repair mojibake in the string and list values of a decoded JSON object.

    Meant to be used as the ``object_hook`` of ``json.load``/``json.loads``.
    Because the decoder calls the hook on inner objects first, dicts found
    inside ``obj`` (directly or inside lists) are already repaired and are
    passed through unchanged.

    Args:
        obj: A dict produced by the JSON decoder. It is modified in place.

    Returns:
        The same dict. String values, and strings at any depth of nested
        lists, are re-encoded as Latin-1 and decoded as UTF-8; all other
        values are left as they are.

    Raises:
        UnicodeEncodeError: A string contains a character outside Latin-1.
        UnicodeDecodeError: The recovered bytes are not valid UTF-8.
    """
    def fix(value):
        if isinstance(value, str):
            return value.encode('latin_1').decode('utf-8')
        if isinstance(value, list):
            # Recurse so that lists of lists are handled as well.
            return [fix(element) for element in value]
        return value

    for key in obj:
        obj[key] = fix(obj[key])
    return obj
Solution 4:
I would like to extend @Geekmoss' answer with the following recursive code snippet, which I used to decode my Facebook data.
import json
def parse_obj(obj):
    """Recursively repair UTF-8-read-as-Latin-1 mojibake in decoded JSON data.

    Unlike an ``object_hook``, this is applied to the fully decoded
    structure and walks it itself.

    Args:
        obj: Any value returned by ``json.loads``: a str, list, dict, number,
            bool or None.

    Returns:
        A repaired copy for strings, lists and dicts (dict keys are repaired
        too, since they are JSON strings as well); any other value is
        returned unchanged.

    Raises:
        UnicodeEncodeError: A string contains a character outside Latin-1.
        UnicodeDecodeError: The recovered bytes are not valid UTF-8.
    """
    if isinstance(obj, str):
        return obj.encode('latin_1').decode('utf-8')
    if isinstance(obj, list):
        return [parse_obj(item) for item in obj]
    if isinstance(obj, dict):
        return {parse_obj(key): parse_obj(value) for key, value in obj.items()}
    return obj
# NOTE(review): json.loads() parses JSON text (str/bytes), so `file` presumably
# holds the file's contents rather than an open file object - confirm.
decoded_data = parse_obj(json.loads(file))
I noticed this works better, because the Facebook data you download might contain lists of dicts, in which case those dicts would just be returned 'as is' because of the lambda identity function.
Solution 5:
Based on @Martijn Pieters solution, I wrote something similar in Java.
/**
 * Reads a Messenger JSON export and returns its text with the encoding repaired.
 *
 * <p>The file is read as UTF-8 and passed through {@code unescapeMessenger}. The
 * result is then encoded as ISO-8859-1, which maps each char in the 0-255 range to
 * the single byte with that value, and those bytes are decoded as UTF-8.
 *
 * @param path location of the JSON file to read
 * @return the repaired JSON text
 * @throws IOException if the file cannot be read
 */
public String getMessengerJson(Path path) throws IOException {
    String unescaped = unescapeMessenger(Files.readString(path, StandardCharsets.UTF_8));
    byte[] rawBytes = unescaped.getBytes(StandardCharsets.ISO_8859_1);
    return new String(rawBytes, StandardCharsets.UTF_8);
}
The unescape method is inspired by the org.apache.commons.lang.StringEscapeUtils.
/**
 * Convenience overload that unescapes {@code str} into a new String.
 *
 * @param str the text to unescape; may be {@code null}
 * @return the unescaped text, or {@code null} when {@code str} is {@code null}
 * @throws UnhandledException if writing to the in-memory buffer fails
 */
private String unescapeMessenger(String str) {
    if (str == null) {
        return null;
    }
    StringWriter buffer = new StringWriter(str.length());
    try {
        unescapeMessenger(buffer, str);
    } catch (IOException ioe) {
        // this should never ever happen while writing to a StringWriter
        throw new UnhandledException(ioe);
    }
    return buffer.toString();
}
/**
 * Copies {@code str} to {@code out}, replacing the JSON unicode escapes (a backslash,
 * the letter 'u' and four hex digits) that stand for a single mis-decoded byte with
 * the character of that value.
 *
 * <p>Only escapes whose value lies in 0x80-0xFF are replaced. Every other escape is
 * written back verbatim: values below 0x80 include the quote, the backslash and the
 * control characters, which must stay escaped for the text to remain valid JSON, and
 * values above 0xFF cannot be represented in ISO-8859-1. Any other backslash sequence
 * is copied unchanged, and an escape that is cut off by the end of the input is
 * written back as-is instead of being dropped.
 *
 * @param out the writer receiving the result; must not be {@code null}
 * @param str the text to process; {@code null} writes nothing
 * @throws IllegalArgumentException if {@code out} is {@code null}
 * @throws NestableRuntimeException if the four characters after the escape
 *         introducer cannot be parsed as a hexadecimal number
 * @throws IOException if writing to {@code out} fails
 */
private void unescapeMessenger(Writer out, String str) throws IOException {
    if (out == null) {
        throw new IllegalArgumentException("The Writer must not be null");
    }
    if (str == null) {
        return;
    }
    int sz = str.length();
    StringBuilder unicode = new StringBuilder(4);
    boolean hadSlash = false;
    boolean inUnicode = false;
    for (int i = 0; i < sz; i++) {
        char ch = str.charAt(i);
        if (inUnicode) {
            unicode.append(ch);
            if (unicode.length() == 4) {
                // unicode now contains the four hex digits of the escape
                int value;
                try {
                    value = Integer.parseInt(unicode.toString(), 16);
                } catch (NumberFormatException nfe) {
                    throw new NestableRuntimeException("Unable to parse unicode value: " + unicode, nfe);
                }
                if (value >= 0x80 && value <= 0xFF) {
                    // one byte of a multi-byte UTF-8 sequence: emit it as a char
                    out.write((char) value);
                } else {
                    // keep the escape so the JSON stays valid and lossless
                    out.write("\\u");
                    out.write(unicode.toString());
                }
                unicode.setLength(0);
                inUnicode = false;
                hadSlash = false;
            }
            continue;
        }
        if (hadSlash) {
            hadSlash = false;
            if (ch == 'u') {
                inUnicode = true;
            } else {
                // not a unicode escape: copy the two characters through untouched
                out.write('\\');
                out.write(ch);
            }
            continue;
        }
        if (ch == '\\') {
            hadSlash = true;
            continue;
        }
        out.write(ch);
    }
    if (inUnicode) {
        // input ended in the middle of an escape: give back what was consumed
        out.write("\\u");
        out.write(unicode.toString());
    } else if (hadSlash) {
        // then we're in the weird case of a \ at the end of the
        // string, let's output it anyway.
        out.write('\\');
    }
}