//-----------------------------------------------------------------------------------------
// BinToUTF8 kind-of-C++-styled pseudocode by Robbi-985 (2016-08-18)
//
//For the sake of keeping this pseudocode simple, no I/O buffering is included.
//Implementing as-is will be very slow. It is faster to read and write files in big chunks.
//
//Overview: every distinct byte value found in the input file is assigned the next unused
//character from a 256-entry lookup table, in order of first appearance. The output text
//file contains one character per input byte. The lookup table is saved separately so that
//the original bytes can be recreated from the text later.

//Shared variables:
//BCL = Byte/Character Lookup table:
int BCLHighestUsed = -1;          //Last index that has been used in the arrays so far.
byte BCLByte[0 to 255];           //Will contain byte values (initial values undefined).
charcode BCLCharCode[0 to 255];   //Unicode character codes (values are your choice).
//^ For the sake of keeping this pseudocode simple, I will assume that each index in
//  here contains a character code (an integer code point), rather than a byte sequence
//  for a UTF-8-encoded character. Exactly how you choose to store them, and exactly which
//  character codes you choose to fill it with, are up to you. The lower indices of the
//  lookup table are used preferentially, so I choose to use ASCII characters in the
//  lower indices and Chinese (CJK) characters when I run out of ASCII characters, like so:
//    Indices 0 to 91:    Character codes 0x23 to 0x7E (ASCII).
//    Indices 92 to 255:  Character codes 0x3900 to 0x39A3 (Chinese).
//  Remember that ASCII characters are already valid UTF-8 encodings.

//Converts a binary file to a UTF-8 text file, one character per input byte, and then
//saves the lookup table that is needed to reverse the conversion.
void ConvBinToUTF8() {
    BCLInit();          //Prepare the lookup table.
    InputFile.Open();   //Any binary file, e.g. raw 8-bit PCM data.
    OutputFile.Open();  //This will become a UTF-8-encoded text file.

    while (!InputFile.EndOfFile) {
        OutputFile.WriteBytes(EncodeUTF8Char(ByteToCharCode(InputFile.ReadByte())));
        //^ EncodeUTF8Char() will obviously vary depending on OS and language. I used the
        //  Windows API call like so (simplified; see MSDN for more information):
        //WideCharToMultiByte(CP_UTF8, 0, NewCharCode.Ptr, 1, OutBuffer.Ptr, OutLength, 0, 0);
    }

    OutputFile.Close();
    InputFile.Close();

    SaveBCL();  //Simply saves contents of our lookup table, i.e. BCLByte[]
                //and BCLCharCode[], to a separate ".bcl" file.
                //We will need this to be able to recreate the input file.
}

//Fills BCLCharCode[] with the 256 character codes that may appear in the output text.
//BCLByte[] is NOT filled here; ByteToCharCode() fills it as bytes are encountered.
void BCLInit() {
    //BCLInitSingle() appends at BCLHighestUsed + 1, so start from "nothing used".
    BCLHighestUsed = -1;

    //This is where you set up BCLCharCode[], i.e. the character codes in the lookup table.
    //Values are your choice, but I did it this way:
    int LoopIndex;

    //Add ASCII characters: #.../ 0...9 :...@ A...Z [...` a...z {...~
    //(92 characters, which land in indices 0 to 91.)
    for (LoopIndex = 0x23; LoopIndex <= 0x7E; LoopIndex ++) {
        BCLInitSingle(charcode(LoopIndex));
    }

    //Add Chinese Unicode characters until all 256 entries are filled
    //(164 characters, 0x3900 to 0x39A3, which land in indices 92 to 255).
    LoopIndex = 0;
    while (BCLHighestUsed < 255) {
        BCLInitSingle(charcode(0x3900 + LoopIndex));
        LoopIndex ++;
    }

    //Reset again: from here on, BCLHighestUsed counts how many entries have had a BYTE
    //assigned to them by ByteToCharCode(), and no bytes have been assigned yet.
    BCLHighestUsed = -1;
}

//Appends one character code to the next free slot of BCLCharCode[].
//Only meant to be called from BCLInit() while the table is being filled.
void BCLInitSingle(charcode NewCharCode) {
    BCLHighestUsed ++;
    BCLCharCode[BCLHighestUsed] = NewCharCode;
}

//Returns the character code that represents ByteVal in the output text.
//A byte value seen for the first time is assigned the next unused table entry, so the
//lowest indices (ASCII, with the table above) go to the bytes that appear earliest.
charcode ByteToCharCode(byte ByteVal) {
    //Search for matching byte in lookup table and return the character code.
    for (int CheckIndex = 0; CheckIndex <= BCLHighestUsed; CheckIndex ++) {
        if (BCLByte[CheckIndex] == ByteVal) {return BCLCharCode[CheckIndex];}
    }

    //None existed - a new entry is needed for this byte.
    if (BCLHighestUsed >= 255) {
        //There must be a bug somewhere - all 256 lookup entries are already used.
        //A byte has only 256 possible values, so this cannot happen while the table is
        //consistent. Fail loudly instead of falling off the end without a return value.
        throw "ByteToCharCode: all 256 lookup entries are already used";
    }

    //Add new entry for this byte, using next available character code.
    BCLHighestUsed ++;
    BCLByte[BCLHighestUsed] = ByteVal;
    return BCLCharCode[BCLHighestUsed];
}

//Converts a UTF-8 text file back to a binary file, using the lookup table that
//ConvBinToUTF8() saved.
void ConvUTF8ToBin() {
    LoadBCL();         //Load previous lookup table arrays from ".bcl" file.
    InputFile.Open();  //A UTF-8-encoded text file.

    AllUTF8[] = DecodeUTF8Bytes(InputFile.ReadAllBytes());
    //^ This ends up as an array of Unicode character codes, some of which will be in the
    //  ASCII range and some of which will be Chinese characters. As before,
    //  DecodeUTF8Bytes() will vary depending on OS and language. I used the
    //  MultiByteToWideChar() API on Windows.

    InputFile.Close();
    OutputFile.Open();  //This will become a recreated file, e.g. raw 8-bit PCM data.

    for (int CharIndex = 0; CharIndex < AllUTF8[].Length; CharIndex ++) {
        OutputFile.WriteByte(BCLCharCodeToByte(AllUTF8[CharIndex]));
    }

    OutputFile.Close();
}

//Returns the byte that CharCodeVal stood for, or 0 if the character is not in the table.
byte BCLCharCodeToByte(charcode CharCodeVal) {
    //Search for matching character code in lookup table and return the byte.
    //NOTE(review): this scans all 256 entries rather than 0 to BCLHighestUsed. An entry
    //that never had a byte assigned during encoding will return whatever LoadBCL() put in
    //BCLByte[] for it. LoadBCL() is not shown here - confirm whether it also restores
    //BCLHighestUsed before narrowing this loop.
    for (int CheckIndex = 0; CheckIndex <= 255; CheckIndex ++) {
        if (BCLCharCode[CheckIndex] == CharCodeVal) {
            return BCLByte[CheckIndex];
        }
    }

    //IMPORTANT: Torch-rnn will write CR+LF at the end of its output, making text files 2
    //bytes longer than you requested of it. As CR and LF are not characters in our
    //lookup table (BCLCharCode[]), we will not have returned a value yet! You can choose
    //your own default value to return in this case here:
    return 0;

    //Not included in this pseudocode: I decided to keep track of the total number of these
    //mismatches and display an error at the end of conversion according to how many failed
    //(i.e. nothing serious if only 2 failed).
}