Characters in string changed after downloading HTML from the internet
Using the following code, I can download the HTML of a file from the internet:
// Create the client that performs the download.
WebClient wc = new WebClient();
// ....
// Fetch the resource at the given URL and keep its body as a string.
// NOTE(review): the text is presumably decoded with the client's configured
// encoding rather than the charset the server declares - confirm against the
// WebClient documentation.
string downloadedFile = wc.DownloadString("http://www.myurl.com/");
However, sometimes the file contains "interesting" characters that come out garbled: é turns into Ã©, ← turns into â†, and フシギダネ turns into ãƒ•ã‚·ã‚®ãƒ€ãƒ.
I think it may have something to do with different Unicode encodings, as each character gets changed into two or more new ones (perhaps each character is being split into its individual bytes), but I have very little knowledge in this area. What do you think is wrong?
Solution 1:
Here's a wrapped download class which supports gzip and checks encoding header and meta tags in order to decode it correctly.
Instantiate the class and call GetPage().
/// <summary>
/// Downloads a page over HTTP and returns it as text, transparently handling
/// gzip/deflate compressed responses and choosing the text encoding from the
/// response headers and, when present, from a <c>&lt;meta ... charset=...&gt;</c>
/// tag inside the document.
/// </summary>
/// <remarks>
/// Usage: construct the downloader and call <see cref="GetPage"/>.
/// After the call, <see cref="Headers"/> holds the response headers and
/// <see cref="Url"/> holds the final response URI.
/// <see cref="Encoding"/> is updated from the response headers only; it is
/// NOT updated when a meta tag causes the body to be decoded again with a
/// different encoding.
/// </remarks>
public class HttpDownloader
{
    // Extracts the charset parameter from a Content-Type header value. The value
    // stops at the next ';' so that trailing parameters are not swallowed.
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>[^;]*)",
        RegexOptions.IgnoreCase);

    // Finds a charset declaration inside a <meta> tag; the value may optionally
    // be wrapped in double quotes.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used for the first decoding pass. Starts as ISO-8859-1 and is
    /// replaced by the charset announced in the response headers, if any.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response (set by <see cref="GetPage"/>).</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// The URL to request. After <see cref="GetPage"/> it is overwritten with
    /// the response URI reported by the server response.
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for the given address.</summary>
    /// <param name="url">Absolute URL to download; parsed with <see cref="Uri"/>, which throws on invalid input.</param>
    /// <param name="referer">Value for the Referer header; ignored when null or empty.</param>
    /// <param name="userAgent">Value for the User-Agent header; ignored when null or empty.</param>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and returns the decoded, trimmed response body.
    /// </summary>
    /// <returns>The page content as a string.</returns>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }

        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Buffers the (decompressed) response body in memory, decodes it with
    /// <see cref="Encoding"/>, and then lets a meta charset declaration
    /// override that decoding.
    /// </summary>
    private string ProcessContent(HttpWebResponse response)
    {
        SetEncodingFromHeader(response);

        using (MemoryStream memStream = new MemoryStream())
        {
            // The body is buffered so it can be decoded a second time if the
            // document itself declares a different charset.
            using (Stream body = OpenBodyStream(response))
            {
                if (body == null)
                {
                    return string.Empty;
                }

                byte[] buffer = new byte[0x1000];
                int bytesRead;
                while ((bytesRead = body.Read(buffer, 0, buffer.Length)) > 0)
                {
                    memStream.Write(buffer, 0, bytesRead);
                }
            }

            memStream.Position = 0;
            using (StreamReader reader = new StreamReader(memStream, Encoding))
            {
                string html = reader.ReadToEnd().Trim();
                return CheckMetaCharSetAndReEncode(memStream, html);
            }
        }
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressing stream when the
    /// Content-Encoding header mentions gzip or deflate.
    /// </summary>
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        if (raw == null)
        {
            return null;
        }

        // Ordinal, case-insensitive matching: header tokens are not linguistic
        // text, and culture-sensitive lower-casing breaks under some cultures.
        string contentEncoding = response.ContentEncoding ?? string.Empty;
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }

        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }

        return raw;
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's character set, falling
    /// back to parsing the Content-Type header when the character set is empty.
    /// Leaves <see cref="Encoding"/> untouched when no usable charset is found.
    /// </summary>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = response.CharacterSet;
        if (string.IsNullOrEmpty(charset))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim(new[] { '\'', '"' });
            }
        }

        if (string.IsNullOrEmpty(charset))
        {
            return;
        }

        try
        {
            Encoding = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown charset name: deliberately keep the current encoding.
        }
    }

    /// <summary>
    /// Looks for a charset declared in a meta tag of <paramref name="html"/> and,
    /// when it names an encoding different from <see cref="Encoding"/>, decodes
    /// <paramref name="memStream"/> again from the start with that encoding.
    /// </summary>
    /// <param name="memStream">Seekable stream holding the raw body bytes; it is closed if a re-decode happens.</param>
    /// <param name="html">The body as decoded by the first pass.</param>
    /// <returns>The re-decoded, trimmed text, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        string charset = m.Groups["charset"].Value;

        // NOTE(review): "unicode"/"utf-16" are deliberately replaced by utf-8;
        // presumably because a tag that was readable after a byte-oriented
        // first pass cannot really be UTF-16 - confirm.
        if (string.Equals(charset, "unicode", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(charset, "utf-16", StringComparison.OrdinalIgnoreCase))
        {
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);

            // Value comparison: two instances for the same charset must not
            // trigger a pointless second decode.
            if (!metaEncoding.Equals(Encoding))
            {
                memStream.Position = 0L;
                using (StreamReader recodeReader = new StreamReader(memStream, metaEncoding))
                {
                    html = recodeReader.ReadToEnd().Trim();
                }
            }
        }
        catch (ArgumentException)
        {
            // Unknown charset name in the meta tag: keep the first-pass text.
        }

        return html;
    }
}
Solution 2:
Since I am not allowed to comment (insufficient reputation), I'll have to post an additional answer. I am using Mikael's great class routinely, but I encountered a practical problem with the regex that tries to find the charset meta-info. This
// Earlier pattern: nothing allows a quote between '=' and the charset value, and the
// value's character class excludes '"', so a quoted value such as charset="UTF-8" does not match.
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*(?<charset>[A-Za-z0-9_-]+)", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
fails on this
<meta charset="UTF-8"/>
whereas this
// Revised pattern: the ""? on each side of the named group lets the charset value be
// optionally wrapped in double quotes.
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
does not.
Thanks, Mikael.