// This code was downloaded from MOBZystems, Home of Tools. No license, use at will. And at your own risk!
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

namespace MOBZystems.Text
{
    /// <summary>
    /// EncodingDetector. Detects the Encoding used in byte arrays
    /// or files by testing the start of the data for a Byte Order Mark
    /// (called 'preamble' in .NET).
    ///
    /// Use ReadAllText() to read a file using a detected encoding.
    ///
    /// Every encoding reported by Encoding.GetEncodings() that has a
    /// non-empty preamble is considered.
    /// </summary>
    public class EncodingDetector
    {
        /// <summary>
        /// Helper class pairing an encoding with its preamble bytes.
        /// </summary>
        protected class PreambleInfo
        {
            protected Encoding _encoding;
            protected byte[] _preamble;

            /// <summary>
            /// The encoding this preamble belongs to.
            /// </summary>
            public Encoding Encoding
            {
                get { return this._encoding; }
            }

            /// <summary>
            /// The preamble (byte order mark) bytes of the encoding.
            /// </summary>
            public byte[] Preamble
            {
                get { return this._preamble; }
            }

            /// <summary>
            /// Constructor with encoding and preamble.
            /// </summary>
            /// <param name="encoding">The encoding</param>
            /// <param name="preamble">The preamble bytes of that encoding</param>
            public PreambleInfo(Encoding encoding, byte[] preamble)
            {
                this._encoding = encoding;
                this._preamble = preamble;
            }
        }

        // The list of encodings with a preamble, sorted longest preamble first.
        // Null until ScanEncodings() has run.
        protected static SortedList<int, PreambleInfo> _preambles = null;

        // Maximum length of all preambles found by ScanEncodings()
        protected static int _maxPreambleLength = 0;

        /// <summary>
        /// Read the contents of a text file as a string. Scan for a preamble first.
        /// If a preamble is found, the corresponding encoding is used.
        /// If no preamble is found, the supplied defaultEncoding is used.
        /// </summary>
        /// <param name="filename">The name of the file to read</param>
        /// <param name="defaultEncoding">The encoding to use if no preamble present</param>
        /// <param name="usedEncoding">The actual encoding used</param>
        /// <returns>The contents of the file as a string, without the preamble</returns>
        /// <exception cref="ArgumentNullException">
        /// No preamble was found and defaultEncoding is null.
        /// </exception>
        public static string ReadAllText(string filename, Encoding defaultEncoding, out Encoding usedEncoding)
        {
            // Read the contents of the file as an array of bytes
            byte[] bytes = File.ReadAllBytes(filename);

            // Detect the encoding of the file
            usedEncoding = DetectEncoding(bytes);

            // Number of leading bytes to skip when decoding
            int offset;
            if (usedEncoding == null)
            {
                // No preamble: fall back to the caller's default. The default is
                // only required on this path, so it is only validated here.
                if (defaultEncoding == null)
                {
                    throw new ArgumentNullException("defaultEncoding",
                        "No preamble was found and no default encoding was supplied.");
                }

                offset = 0;
                usedEncoding = defaultEncoding;
            }
            else
            {
                // Skip the preamble so it does not end up in the returned text
                offset = usedEncoding.GetPreamble().Length;
            }

            // Interpret the remaining bytes according to the encoding
            return usedEncoding.GetString(bytes, offset, bytes.Length - offset);
        }

        /// <summary>
        /// Detect the encoding in an array of bytes by matching its first
        /// bytes against the known preambles, longest preamble first.
        /// </summary>
        /// <param name="bytes">The bytes to inspect</param>
        /// <returns>The encoding found, or null if no preamble matches</returns>
        /// <exception cref="ArgumentNullException">bytes is null.</exception>
        public static Encoding DetectEncoding(byte[] bytes)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            // Scan for encodings if we haven't done so
            if (_preambles == null)
            {
                ScanEncodings();
            }

            // Try each preamble in turn; the list order guarantees that a longer
            // preamble is tested before a shorter one that is its prefix.
            foreach (PreambleInfo info in _preambles.Values)
            {
                if (StartsWith(bytes, info.Preamble))
                {
                    return info.Encoding;
                }
            }

            return null;
        }

        /// <summary>
        /// Detect the encoding of a file. Reads just enough of
        /// the file to be able to detect a preamble.
        /// </summary>
        /// <param name="filename">The path name of the file</param>
        /// <returns>The encoding detected, or null if no preamble found</returns>
        public static Encoding DetectEncoding(string filename)
        {
            // Scan for encodings if we haven't done so
            // (this also establishes _maxPreambleLength)
            if (_preambles == null)
            {
                ScanEncodings();
            }

            using (FileStream stream = File.OpenRead(filename))
            {
                // Never read more than the length of the file
                // or the maximum preamble length
                long n = stream.Length;

                // No bytes? No encoding!
                if (n == 0)
                {
                    return null;
                }

                if (n > _maxPreambleLength)
                {
                    n = _maxPreambleLength;
                }

                byte[] bytes = new byte[n];

                // Stream.Read may return fewer bytes than requested, so keep
                // reading until the buffer is full or the stream ends.
                int total = 0;
                while (total < bytes.Length)
                {
                    int read = stream.Read(bytes, total, bytes.Length - total);
                    if (read == 0)
                    {
                        break;
                    }
                    total += read;
                }

                // If the stream ended early, drop the unread tail: the zero bytes
                // left there could otherwise complete a longer preamble by accident.
                if (total < bytes.Length)
                {
                    Array.Resize(ref bytes, total);
                }

                // Detect the encoding from the byte array
                return DetectEncoding(bytes);
            }
        }

        /// <summary>
        /// Loop over all available encodings and store those
        /// with a preamble in the _preambles list.
        /// The list is sorted by preamble length,
        /// longest preamble first. This prevents
        /// a short preamble 'masking' a longer one
        /// later in the list.
        /// </summary>
        protected static void ScanEncodings()
        {
            // Build the list locally and assign the field only when it is
            // complete, so the field never refers to a half-filled list.
            SortedList<int, PreambleInfo> preambles = new SortedList<int, PreambleInfo>();
            int longest = 0;

            foreach (EncodingInfo encodingInfo in Encoding.GetEncodings())
            {
                // Instantiate the encoding once and reuse it
                Encoding encoding = encodingInfo.GetEncoding();
                byte[] preamble = encoding.GetPreamble();

                // Encodings without a preamble cannot be detected this way
                if (preamble.Length == 0)
                {
                    continue;
                }

                // Negated key => ascending sort puts the longest preamble first;
                // the code page is mixed in to keep the keys unique.
                int key = -(preamble.Length * 1000000 + encodingInfo.CodePage);
                preambles.Add(key, new PreambleInfo(encoding, preamble));

                if (preamble.Length > longest)
                {
                    longest = preamble.Length;
                }
            }

            // Update the maximum preamble length if a longer one was found
            if (longest > _maxPreambleLength)
            {
                _maxPreambleLength = longest;
            }

            _preambles = preambles;
        }

        // Returns true when 'data' begins with all bytes of 'prefix'.
        private static bool StartsWith(byte[] data, byte[] prefix)
        {
            if (data.Length < prefix.Length)
            {
                return false;
            }

            for (int i = 0; i < prefix.Length; i++)
            {
                if (data[i] != prefix[i])
                {
                    return false;
                }
            }

            return true;
        }
    }
}