* -------------------------------------------------------------------------
* This file is part of the MindStudio project.
* Copyright (c) 2025 Huawei Technologies Co.,Ltd.
*
* MindStudio is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*/
#include "pch.h"
#include "FileParser.h"
#include "JsonFileProcess.h"
namespace Dic {
namespace Module {
/**
 * Opens the JSON file at filePath and computes the byte ranges it is split into.
 *
 * @param filePath path of the file, opened for binary reading.
 * @param position optional pair of (start, end) byte offsets to split; when it holds no
 *                 value the range goes from 0 to the offset reported at the end of the file.
 * @return the ranges produced by GetSplitPosition, or an empty vector when the file
 *         cannot be opened.
 */
std::vector<std::pair<int64_t, int64_t>> JsonFileProcess::SplitFile(
    const std::string &filePath, std::optional<std::pair<int64_t, int64_t>> position) {
    std::ifstream file = OpenReadFileSafely(filePath, std::ios::in | std::ios::binary);
    if (!file.is_open()) {
        Dic::Server::ServerLog::Error("Split file failed to open json file. ");
        return {};
    }

    std::pair<int64_t, int64_t> range;
    if (!position.has_value()) {
        // No explicit range requested: cover the file from offset 0 to its end.
        file.seekg(0, std::ifstream::end);
        const int64_t fileEnd = file.tellg();
        range.first = 0;
        range.second = fileEnd;
    } else {
        range = *position;
    }

    std::vector<std::pair<int64_t, int64_t>> blocks = GetSplitPosition(file, range);
    file.close();
    return blocks;
}
/**
 * Splits the byte range `position` of an already opened JSON file into blocks.
 *
 * The range is validated against the file size first. The format is then detected by
 * searching for a "traceEvents" key from the start of the range (found: object format,
 * otherwise: array format). Ranges not larger than blockSize are delegated to
 * ComputeSmallFilePosition. Larger ranges are cut into blocks that start at a '{' and
 * end where SeekPhEndPosition leaves the stream.
 *
 * @param file     opened input stream; its read position is modified.
 * @param position (start, end) byte offsets of the content to split.
 * @return the (start, end) offsets of every block found; empty when the range is invalid.
 */
std::vector<std::pair<int64_t, int64_t>> JsonFileProcess::GetSplitPosition(
    std::ifstream &file, std::pair<int64_t, int64_t> position) {
    file.seekg(0, std::ifstream::end);
    const int64_t fileSize = file.tellg();
    const int64_t contentStart = position.first;
    const int64_t contentEnd = position.second;
    if (contentStart < 0 || contentEnd < 0 || contentStart >= contentEnd || contentEnd > fileSize) {
        Dic::Server::ServerLog::Error("Invalid position to split file, start position is % and end position is %.",
            position.first, position.second);
        return {};
    }
    std::vector<std::pair<int64_t, int64_t>> result = {};
    const int64_t contentSize = contentEnd - contentStart;
    file.clear();
    file.seekg(contentStart, std::ios::beg);
    // When the key is found the stream is left at the match, so the '{' search below
    // starts from there; otherwise the stream stays at contentStart.
    Dic::Module::JsonFormat json = SeekRegexPosition(file, R"(\"traceEvents")")
        ? Dic::Module::JsonFormat::JSON_OBJECT_FORMAT
        : Dic::Module::JsonFormat::JSON_ARRAY_FORMAT;
    if (contentSize <= blockSize) {
        ComputeSmallFilePosition(file, result, json, position);
        return result;
    }
    bool endFlag = false;
    while (!endFlag) {
        if (!SeekCharPosition(file, '{')) {
            Dic::Server::ServerLog::Info("Failed to find json format start position.");
            break;
        }
        const int64_t start = file.tellg();
        // `start` is an absolute file offset, so it has to be compared with the absolute
        // end of the content (not with the content length) to detect the last block.
        if (start + blockSize >= contentEnd) {
            file.seekg(contentEnd - endBufferLength, std::ifstream::beg);
            endFlag = true;
        } else {
            file.seekg(blockSize, std::ifstream::cur);
        }
        if (!SeekPhEndPosition(file, endFlag, endBufferLength)) {
            Dic::Server::ServerLog::Error("Failed to find ph json format.");
            break;
        }
        const int64_t end = file.tellg();
        // The tail search restarts near contentEnd and may stop before `start`;
        // never report a block whose end is not after its start.
        if (end <= start) {
            Dic::Server::ServerLog::Warn("Failed to find legal end position of json block.");
            break;
        }
        result.emplace_back(start, end);
    }
    return result;
}
/**
 * Computes the single block of a range that is small enough not to be split.
 *
 * For any format other than the object format a (0, 0) pair is appended. For the object
 * format the block starts one byte after the first match of `[` followed by `{` and ends
 * where SeekPhEndPosition leaves the stream when searching the tail of the range.
 * Nothing is appended when either position cannot be determined.
 *
 * @param file     opened input stream; its read position is modified.
 * @param result   output vector that receives the block.
 * @param json     detected format of the content.
 * @param position (start, end) byte offsets of the content.
 */
void JsonFileProcess::ComputeSmallFilePosition(std::ifstream &file, std::vector<std::pair<int64_t, int64_t>> &result,
    const JsonFormat &json, std::pair<int64_t, int64_t> position) {
    if (json != JsonFormat::JSON_OBJECT_FORMAT) {
        result.emplace_back(0, 0);
        return;
    }

    const int64_t rangeBegin = position.first;
    const int64_t rangeEnd = position.second;
    const int64_t rangeLength = rangeEnd - rangeBegin;
    if (!SeekRegexPosition(file, R"(\[\s*\{)")) {
        Server::ServerLog::Warn("Failed to find start position of json object format.");
        return;
    }
    // The stream now points at the '[' of the match.
    const int64_t arrayOpen = file.tellg();

    // Search at most endBufferLength bytes at the tail of the range, or the whole range
    // when it is shorter than that.
    const int tailLength = rangeLength < endBufferLength ? rangeLength : endBufferLength;
    if (rangeLength >= endBufferLength) {
        file.seekg(rangeEnd - endBufferLength, std::ifstream::beg);
    } else {
        file.seekg(rangeBegin, std::ifstream::beg);
    }

    if (!SeekPhEndPosition(file, true, tailLength)) {
        return;
    }
    const int64_t lastEnd = file.tellg();
    if (arrayOpen > INT64_MAX - 1 || arrayOpen + 1 > lastEnd) {
        Server::ServerLog::Warn("Failed to find legal end position of json object format.");
        return;
    }
    // Skip the '[' itself so the block begins right after it.
    result.emplace_back(arrayOpen + 1, lastEnd);
}
/**
 * Moves the read position of `file` forward to the first occurrence of `c`.
 *
 * Only the next startBufferLength bytes after the current position are inspected.
 *
 * @param file opened input stream.
 * @param c    character to look for.
 * @return true when the character was found and the stream now points at it; false when
 *         nothing could be read or the character is not inside the inspected window
 *         (in the latter case the stream is restored to its original position).
 */
bool JsonFileProcess::SeekCharPosition(std::ifstream &file, char c) {
    file.clear();
    const auto origin = file.tellg();
    auto window = std::make_unique<char[]>(startBufferLength);
    file.read(window.get(), startBufferLength);
    const int64_t got = file.gcount();
    if (got <= 0) {
        Dic::Server::ServerLog::Error("Seek char. Failed to read file.");
        return false;
    }

    // A short read sets eof/fail bits; reset them before rewinding to the origin.
    file.clear();
    file.seekg(origin);

    const std::string text(window.get(), got);
    const size_t hit = text.find(c);
    if (hit != std::string::npos) {
        file.seekg(static_cast<int64_t>(hit), std::ifstream::cur);
        return true;
    }
    Dic::Server::ServerLog::Error("Failed to find separator.");
    return false;
}
/**
 * Moves the read position of `file` forward to the first match of `regex`.
 *
 * Only the next endBufferLength bytes after the current position are searched.
 *
 * @param file  opened input stream.
 * @param regex regular expression handed to Dic::RegexUtil::RegexSearch.
 * @return true when a match was found and the stream now points at its first character;
 *         false when nothing could be read or there is no match inside the searched
 *         window (in the latter case the stream is restored to its original position).
 */
bool JsonFileProcess::SeekRegexPosition(std::ifstream &file, const std::string &regex) {
    file.clear();
    auto cur = file.tellg();
    std::unique_ptr<char[]> buffer = std::make_unique<char[]>(endBufferLength);
    file.read(buffer.get(), endBufferLength);
    int64_t readCount = file.gcount();
    if (readCount <= 0) {
        Dic::Server::ServerLog::Error("Seek regex. Failed to read file.");
        return false;
    }
    // A short read sets eof/fail bits; reset them before rewinding to the origin.
    file.clear();
    file.seekg(cur);
    std::string str(buffer.get(), readCount);
    auto result = Dic::RegexUtil::RegexSearch(str, regex);
    if (!result.has_value()) {
        // NOTE(review): other log calls in this file use '%' placeholders; this message
        // has none, so the pattern may not be printed - confirm against ServerLog.
        Dic::Server::ServerLog::Warn("Can't find match regex:", regex);
        return false;
    }
    // The match offset is relative to the start of the window, i.e. to `cur`.
    file.seekg(result.value().position(), std::ifstream::cur);
    return true;
}
/**
 * Moves the read position of `file` to the '}' that closes the object containing a
 * "ph" key.
 *
 * The next `bufferLength` bytes after the current position are read. The last
 * occurrence of "ph" (with its quotes) is used when `endFlag` is set, otherwise the
 * first one. From there the text is scanned forward, tracking nested '{' / '}' pairs,
 * until an unmatched '}' is met.
 *
 * @param file         opened input stream.
 * @param endFlag      true to search from the back of the window, false from the front.
 * @param bufferLength number of bytes to read and inspect.
 * @return true when the closing brace was found and the stream now points at it; false
 *         when nothing could be read, no "ph" key exists in the window, or no closing
 *         brace follows it.
 */
bool JsonFileProcess::SeekPhEndPosition(std::ifstream &file, bool endFlag, int bufferLength) {
    file.clear();
    const auto origin = file.tellg();
    auto window = std::make_unique<char[]>(bufferLength);
    file.read(window.get(), bufferLength);
    const int64_t got = file.gcount();
    if (got <= 0) {
        Dic::Server::ServerLog::Error("Seek ph end position. Failed to read file.");
        return false;
    }

    // A short read sets eof/fail bits; reset them before rewinding to the origin.
    file.clear();
    file.seekg(origin);

    const std::string text(window.get(), got);
    const size_t phAt = endFlag ? text.rfind("\"ph\"") : text.find("\"ph\"");
    if (phAt == std::string::npos || phAt > INT_MAX || text.size() > INT_MAX) {
        Dic::Server::ServerLog::Error("Failed to find ph.");
        return false;
    }

    const int total = static_cast<int>(text.size());
    int closeAt = -1;
    int depth = 0;  // number of '{' opened after the "ph" key and not yet closed
    for (int idx = static_cast<int>(phAt); idx < total && closeAt < 0; ++idx) {
        switch (text[idx]) {
            case '{':
                ++depth;
                break;
            case '}':
                if (depth == 0) {
                    closeAt = idx;  // unmatched '}' closes the enclosing object
                } else {
                    --depth;
                }
                break;
            default:
                break;
        }
    }
    if (closeAt < 0) {
        Dic::Server::ServerLog::Error("Failed to find ph end position.");
        return false;
    }
    file.seekg(closeAt, std::ifstream::cur);
    return true;
}
}
}