Reading bytes from a JavaScript string

I have a string containing binary data in JavaScript. Now I want to read, for example, an integer from it. So I get the first 4 characters, use charCodeAt, do some shifting, etc. to get an integer.

The problem is that strings in JavaScript are UTF-16 (instead of ASCII) and charCodeAt often returns values higher than 256.

The Mozilla reference states that "The first 128 Unicode code points are a direct match of the ASCII character encoding." (what about ASCII values > 128?).

How can I convert the result of charCodeAt to an ASCII value? Or is there a better way to convert a string of four characters to a 4 byte integer?


Solution 1:

I believe that you can can do this with relatively simple bit operations:

function stringToBytes ( str ) {
  var ch, st, re = [];
  for (var i = 0; i < str.length; i++ ) {
    ch = str.charCodeAt(i);  // get char 
    st = [];                 // set up "stack"
    do {
      st.push( ch & 0xFF );  // push byte to stack
      ch = ch >> 8;          // shift value down by 1 byte
    }  
    while ( ch );
    // add stack contents to result
    // done because chars have "wrong" endianness
    re = re.concat( st.reverse() );
  }
  // return an array of bytes
  return re;
}

stringToBytes( "A\u1242B\u4123C" );  // [65, 18, 66, 66, 65, 35, 67]

It should be a simple matter to sum the output up by reading the byte array as if it were memory and adding it up into larger numbers:

function getIntAt ( arr, offs ) {
  return (arr[offs+0] << 24) +
         (arr[offs+1] << 16) +
         (arr[offs+2] << 8) +
          arr[offs+3];
}

function getWordAt ( arr, offs ) {
  return (arr[offs+0] << 8) +
          arr[offs+1];
}

'\\u' + getWordAt( stringToBytes( "A\u1242" ), 1 ).toString(16);  // "1242"

Solution 2:

Borgar's answer seems correct.

Just wanted to clarify one point. Javascript treats bitwise operations as '32-bit signed int's, where the last (left-most) bit is the sign bit. Ie,

getIntAt([0x7f,0,0,0],0).toString(16)  //  "7f000000"

getIntAt([0x80,0,0,0],0).toString(16)  // "-80000000"

However, for octet-data processing (eg, network stream, etc), usually want the 'unsigned int' representation. This can be accomplished by adding a '>>> 0' (zero-fill right-shift) operator which internally tells Javascript to treat this as unsigned.

function getUIntAt ( arr, offs ) {
  return (arr[offs+0] << 24) +
         (arr[offs+1] << 16) +
         (arr[offs+2] << 8) +
          arr[offs+3] >>> 0;
}

getUIntAt([0x80,0,0,0],0).toString(16)   // "80000000"

Solution 3:

There are two methods for encoding and decoding utf-8 string to a byte array and back.

var utf8 = {}

utf8.toByteArray = function(str) {
    var byteArray = [];
    for (var i = 0; i < str.length; i++)
        if (str.charCodeAt(i) <= 0x7F)
            byteArray.push(str.charCodeAt(i));
        else {
            var h = encodeURIComponent(str.charAt(i)).substr(1).split('%');
            for (var j = 0; j < h.length; j++)
                byteArray.push(parseInt(h[j], 16));
        }
    return byteArray;
};

utf8.parse = function(byteArray) {
    var str = '';
    for (var i = 0; i < byteArray.length; i++)
        str +=  byteArray[i] <= 0x7F?
                byteArray[i] === 0x25 ? "%25" : // %
                String.fromCharCode(byteArray[i]) :
                "%" + byteArray[i].toString(16).toUpperCase();
    return decodeURIComponent(str);
};

// sample
var str = "Да!";
var ba = utf8.toByteArray(str);
alert(ba);             // 208, 148, 208, 176, 33
alert(ba.length);      // 5
alert(utf8.parse(ba)); // Да!