I wanted to write an article summarizing character encodings and how to handle Unicode in programs, but I found that earlier authors have already covered the topic very thoroughly, so I will not repeat it here; see http://www.regexlab.com/zh/encoding.htm.
In C++, characters come in two kinds, char and wchar_t, and strings correspondingly come in two kinds, std::string and std::wstring. In C#, a string is always a Unicode string, and each char is 16 bits wide.
String constants in the source file are automatically converted to Unicode (UTF-16). Conversions between different encodings can be performed with the System.Text.Encoding class.
- using System;
- using System.Text;
- namespace test
- {
- class Program
- {
- static void Main(string[] args)
- {
- string u16s = "Forget it abc"; //The default character encoding is unicode, or utf16
- //4 codes
- Encoding utf8 = Encoding.UTF8;
- Encoding utf16 = Encoding.Unicode;
- Encoding gb = Encoding.GetEncoding("gbk");
- Encoding b5 = Encoding.GetEncoding("big5");
- //Convert to four encoded byte streams
- byte[] u16bytes = utf16.GetBytes(u16s);
- byte[] u8bytes = Encoding.Convert(utf16, utf8, u16bytes);
- byte[] gbytes = Encoding.Convert(utf16, gb, u16bytes);
- byte[] bbytes = Encoding.Convert(utf16, b5, u16bytes);
- Console.Write("unicode: ");
- foreach (byte c in u16bytes)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("utf8: ");
- foreach(byte c in u8bytes)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("gbk: ");
- foreach (byte c in gbytes)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("big5: ");
- foreach (byte c in bbytes)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- //Get four encoded string s
- string u8s = utf8.GetString(u8bytes);
- string gs = gb.GetString(gbytes);
- string bs = b5.GetString(bbytes);
- Console.WriteLine("unicode: " + u16s + " " + u16s.Length.ToString());
- Console.WriteLine("utf8: " + u8s + " " + u16s.Length.ToString());
- Console.WriteLine("gbk: " + gs + " " + gs.Length.ToString());
- Console.WriteLine("big5: " + bs + " " + bs.Length.ToString());
- Console.Write("unicode: ");
- foreach (char c in u16s)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("utf8: ");
- foreach (char c in u8s)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("gb2312: ");
- foreach (char c in gs)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.Write("big5: ");
- foreach (char c in bs)
- {
- Console.Write(((int)c).ToString("x") + " ");
- }
- Console.WriteLine();
- Console.ReadKey();
- }
- }
- }
//Output of the above program:
//Here are four types of encoded byte strings
unicode: d8 5f 18 8a 86 4e 4a 55 61 0 62 0 63 0
utf8: e5 bf 98 e8 a8 98 e4 ba 86 e5 95 8a 61 62 63
gbk: cd fc d3 9b c1 cb b0 a1 61 62 63
big5: a7 d1 b0 4f a4 46 b0 da 61 62 63
//Four strings from GetString() method
unicode: forget abc 7
utf8: forget abc 7
gbk: forget abc 7
big5: forget abc 7
//Character codes (UTF-16 code units) in the four strings
unicode: 5fd8 8a18 4e86 554a 61 62 63
utf8: 5fd8 8a18 4e86 554a 61 62 63
gb2312: 5fd8 8a18 4e86 554a 61 62 63
big5: 5fd8 8a18 4e86 554a 61 62 63
As you can see, the strings returned by the GetString method are all Unicode (UTF-16) strings, whatever encoding the bytes were in. In other words, GetString "decodes" a byte array in a given encoding into a Unicode string.