/*
* Copyright (c) Cangjie Library Team 2022-2022. All rights reserved.
*/
/**
* @file
* Read gbk encoded file stream
*/
package csv4cj
import std.fs.*
/**
* The file is parsed using the gbk encoding format.
* The caller needs to ensure the correct encoding format of the file,
* otherwise the expected parsing result may not be obtained
* @author LilFlameZ
* @since 1.0.4
*/
public class GBKReaderStream <: CharReader {
    // Byte source that every read/peek operation of this class goes through.
    let fileStream: BufferedReader

    /**
    * Creates a reader that decodes the bytes of `s` as GBK text.
    * The caller needs to ensure the file really is GBK encoded,
    * otherwise the expected parsing result may not be obtained.
    *
    * @param s of File, wrapped in a BufferedReader
    * @since 1.0.4
    */
    public init(s: File) {
        fileStream = BufferedReader(s)
    }

    /**
    * Reads and consumes the next character from the stream.
    *
    * A byte in 0x81..0xFE is treated as the lead byte of a two-byte GBK
    * sequence and is decoded together with the following byte. Any other
    * byte (0x00..0x80 and 0xFF) is converted to a Rune directly.
    *
    * @return Type of Option<Rune>; None when no byte is available, or when
    *         the stream ends right after a lead byte (the lead byte is
    *         consumed and dropped in that case)
    * @throws Exception when a two-byte sequence has no entry in the mapping table
    * @since 1.0.4
    */
    public func read(): Option<Rune> {
        var result = Option<Rune>.None
        if (let Some(leadByte) <- fileStream.read()) {
            if (isDoubleByteChar(leadByte)) {
                // The trail byte is mandatory; a missing one leaves result as None.
                if (let Some(trailByte) <- fileStream.read()) {
                    result = Rune(convertGBKToUnicode(leadByte, trailByte))
                }
            } else {
                // Single byte character
                result = Rune(leadByte)
            }
        }
        return result
    }

    // Returns true when `byte` is in the GBK lead-byte range 0x81..0xFE,
    // i.e. it starts a two-byte character. ASCII (0x00..0x7F), 0x80 and 0xFF
    // are reported as single-byte.
    private func isDoubleByteChar(byte: UInt8): Bool {
        return byte >= 0x81 && byte <= 0xFE
    }

    // Converts a two-byte GBK sequence to its Unicode code point.
    // The two bytes are packed into one key (lead byte in the high 8 bits)
    // and looked up in `gbk2unicode`, which is declared outside this class.
    // A NoneValueException raised by the lookup is rethrown as a plain
    // Exception with a descriptive message.
    private func convertGBKToUnicode(byte1: UInt8, byte2: UInt8): UInt32 {
        let gbkKey = (UInt32(byte1) << 8) + UInt32(byte2)
        try {
            return gbk2unicode[gbkKey]
        } catch (e: NoneValueException) {
            throw Exception("GBK character not found in mapping table")
        }
    }

    /**
    * Look at a few characters
    * Reads up to buf.size characters from the current position of the stream into buf,
    * and then restores the stream to the position it had before reading.
    * It is equivalent to not moving the current position of the stream, but just
    * glancing at the contents of the next few characters.
    *
    * @param buf of Array<Rune>, filled from index 0
    *
    * @return Type of Int64, the number of characters actually stored in buf
    *         (less than buf.size when the stream ends early)
    * @throws Exception when a two-byte sequence has no entry in the mapping table;
    *         the stream position is still restored in that case
    * @since 1.0.4
    */
    public func lookAhead(buf: Array<Rune>): Int64 {
        if (buf.size == 0) {
            return 0
        }
        fileStream.mark(buf.size * 2) // GBK characters can be up to 2 bytes
        var currentChrCount = 0
        try {
            // Decode with read() directly. Calling the single-character
            // lookAhead() here would issue its own mark()/reset() on the same
            // reader and could replace the mark set above, so the final reset()
            // would no longer return to the starting position.
            // NOTE(review): assumes BufferedReader keeps a single mark - confirm.
            while (currentChrCount < buf.size) {
                if (let Some(chr) <- read()) {
                    buf[currentChrCount] = chr
                    currentChrCount++
                } else {
                    break
                }
            }
        } finally {
            // Always rewind, even when decoding throws, so a peek never consumes input.
            fileStream.reset()
        }
        return currentChrCount
    }

    /**
    * Gets the next character without changing the current position of the stream
    *
    * @return Type of Option<Rune>, the same value read() would return at this position
    * @throws Exception when a two-byte sequence has no entry in the mapping table;
    *         the stream position is still restored in that case
    * @since 1.0.4
    */
    public func lookAhead(): Option<Rune> {
        fileStream.mark(2) // GBK characters can be up to 2 bytes
        var result = Option<Rune>.None
        try {
            result = read()
        } finally {
            // Rewind on both the normal and the exceptional path.
            fileStream.reset()
        }
        return result
    }
}