New XML System, first pass.

This commit is contained in:
2023-07-12 10:34:51 -07:00
parent 1c7abbf140
commit 19c6575aaf
9 changed files with 126 additions and 156 deletions

View File

@ -7,6 +7,12 @@
using namespace Dawn;
/**
 * Constructs an empty XML node with every member in a defined state.
 *
 * The node type defaults to XML_NODE_TYPE_TEXT, the value is an empty string
 * and no child element is attached. Previously nodeType was left
 * uninitialized, so reading it on a default-constructed node was undefined
 * behaviour. Callers overwrite nodeType / value / child after construction as
 * required.
 */
XmlNode::XmlNode() :
  nodeType(XML_NODE_TYPE_TEXT),
  value(),
  child(nullptr)
{
}
/**
 * Tests whether a character is one of the whitespace characters recognised by
 * the parser: space, carriage return, line feed or horizontal tab.
 *
 * @param c Character to test.
 * @return True if c is one of the four whitespace characters, otherwise false.
 */
bool_t Xml::isWhitespace(char_t c) {
  switch(c) {
    case ' ':
    case '\r':
    case '\n':
    case '\t':
      return true;

    default:
      return false;
  }
}
@ -26,9 +32,8 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
bool_t insideTag = false;
std::string buffer = "";
std::string attrKey = "";
std::string bufferWhitespaces;
bool_t valueIsInWhitespace = false;
size_t i = *j;
struct XmlNode childNode;
while(c = data[i++]) {
if(insideTag) {
@ -39,26 +44,21 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
case XML_PARSE_STATE_DOING_NOTHING:
if(c == '>') continue;
if(c == '<') {
// Parsing comment?
if(data[i] == '!' && data[i+1] == '-' && data[i+2] == '-') {
doingBeforeComment = doing;
doing = XML_PARSE_STATE_PARSING_COMMENT;
i += 3;
} else if(data[i] == '!' && !insideTag) {
// Likely <!DOCTYPE ...>
while((c = data[i++]) != '>') {
// Nothing needs doing here right now, in future may support doctype
}
continue;
} else if(insideTag) {
if(data[i] == '/') {
i -= 1;
doing = XML_PARSE_STATE_PARSING_CHILD;
} else {
i -= 1;
auto child = new Xml();
Xml::load(child, data, &i);
xml->children.push_back(child);
doing = XML_PARSE_STATE_PARSING_CHILD;
// Remove last char since we kinda already parsed it.
xml->innerXml += child->outerXml;
xml->outerXml = xml->outerXml.substr(0, xml->outerXml.size()-1);
xml->outerXml += child->outerXml;
doing = XML_PARSE_STATE_PARSING_CLOSE;
continue;
}
} else {
doing = XML_PARSE_STATE_PARSING_TAG_NAME;
@ -69,8 +69,8 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
continue;
}
xml->innerXml += c;
if(Xml::isWhitespace(c)) continue;
if(insideTag) xml->innerXml += c;
if(Xml::isWhitespace(c)) continue;// NEEDS TO GO?
doing = XML_PARSE_STATE_PARSING_VALUE;
buffer += c;
break;
@ -86,7 +86,7 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
insideTag = false;
doing = XML_PARSE_STATE_PARSING_CLOSE;
} else {
doing = c == '>' ? XML_PARSE_STATE_DOING_NOTHING : XML_PARSE_STATE_LOOKING_FOR_ATTRIBUTE;
doing = c == '>' ? XML_PARSE_STATE_PARSING_VALUE : XML_PARSE_STATE_LOOKING_FOR_ATTRIBUTE;
}
continue;
}
@ -98,7 +98,7 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
// Look until we hit either the end of a tag, or the attribute itself
if(Xml::isWhitespace(c) || c == '>' || c == '/' || c == '=') {
if(c == '>' || c == '/') {
doing = XML_PARSE_STATE_DOING_NOTHING;
doing = XML_PARSE_STATE_PARSING_VALUE;
if(c == '/') {
level--;
insideTag = false;
@ -149,116 +149,86 @@ void Xml::load(Xml *xml, std::string data, size_t *j) {
case XML_PARSE_STATE_PARSING_VALUE:
// Keep parsing child until we find a < for an opening/closing tag.
if(c == '<' && !(data[i] == '<' || data[i-2] == '<')) {
if(buffer.size() > 0) {
childNode.nodeType = XML_NODE_TYPE_TEXT;
childNode.value = buffer;
xml->childNodes.push_back(childNode);
}
// Are we parsing the close tag, or parsing a child?
if(data[i] == '/') {
// In HTML Spec there could be a child here but not in XML spec.
doing = XML_PARSE_STATE_PARSING_CLOSE;
xml->value = buffer;
xml->textContent = buffer;
buffer.clear();
valueIsInWhitespace = false;
bufferWhitespaces.clear();
continue;
}
std::cout << "Detected unsupported use of a child within a node value, e.g. <div>Hello <b>world</b> how are you?</div>" << std::endl;
throw "Test";
continue;
}
xml->innerXml += c;
if(Xml::isWhitespace(c)) {
if(!valueIsInWhitespace) {
bufferWhitespaces.clear();
bufferWhitespaces += c;
valueIsInWhitespace = true;
} else {
if(c != ' ') bufferWhitespaces += c;
}
// TODO: I can maybe consider indentation here
} else {
if(valueIsInWhitespace) {
buffer += bufferWhitespaces;
valueIsInWhitespace = false;
}
if(c == '&') {
// Handle special characters. First read ahead to nearest semicolon OR
// nearest closing tag.
std::string sc;
while(c = data[i++]) {
xml->innerXml += c;
if(c == ';') break;
if(c == '<') assertUnreachable();//Invalid XML
sc += c;
}
if(valueIsInWhitespace) {
buffer += bufferWhitespaces;
valueIsInWhitespace = false;
}
if(sc == "lt") {
buffer += '<';
} else if(sc == "gt") {
buffer += '>';
} else if(sc == "amp") {
buffer += '&';
} else if(sc == "apos") {
buffer += '\'';
} else if(sc == "quot") {
buffer += '"';
} else if(sc == "nbsp") {
buffer += ' ';
} else {
// Try parse as integer
if(sc.size() > 1 && sc[0] == '#') {
int code = std::stoi(sc.substr(1));
buffer += (char)code;
} else {
std::cout << "Unknown Special character: " << sc << std::endl;
assertUnreachable();
}
}
} else {
buffer += c;
}
}
break;
case XML_PARSE_STATE_PARSING_CHILD:
if(c == '<') {
// Read ahead and confirm this is a close or not
if(data[i] == '/') {
doing = XML_PARSE_STATE_PARSING_CLOSE;
continue;
}
if(data[i] == '!' && data[i+1] == '-' && data[i+2] == '-') {
} else if(data[i] == '!' && data[i+1] == '-' && data[i+2] == '-') {
doingBeforeComment = doing;
doing = XML_PARSE_STATE_PARSING_COMMENT;
i += 3;
continue;
}
// Likely another child.
auto child = new Xml();
// Parsing child
i -= 1;
// @deprecated
auto child = new Xml();
Xml::load(child, data, &i);
xml->children.push_back(child);
childNode = XmlNode();
childNode.nodeType = XML_NODE_TYPE_ELEMENT;
childNode.child = child;
xml->childNodes.push_back(childNode);
// Remove last char since we kinda already parsed it.
xml->innerXml += child->outerXml;
xml->outerXml = xml->outerXml.substr(0, xml->outerXml.size()-1);
xml->outerXml += child->outerXml;
}
if(Xml::isWhitespace(c)) {
xml->innerXml += c;
buffer.clear();
continue;
}
// In HTML Spec there's a chance for there to be a value here, but not
// in the XML spec.
xml->innerXml += c;
if(c == '&') {
// Handle special characters. First read ahead to nearest semicolon OR
// nearest closing tag.
std::string sc;
while(c = data[i++]) {
xml->innerXml += c;
if(c == ';') break;
if(c == '<') assertUnreachable();//Invalid XML
sc += c;
}
if(sc == "lt") {
buffer += '<';
} else if(sc == "gt") {
buffer += '>';
} else if(sc == "amp") {
buffer += '&';
} else if(sc == "apos") {
buffer += '\'';
} else if(sc == "quot") {
buffer += '"';
} else if(sc == "nbsp") {
buffer += ' ';
} else {
// Try parse as integer
if(sc.size() > 1 && sc[0] == '#') {
int code = std::stoi(sc.substr(1));
buffer += (char)code;
} else {
std::cout << "Unknown Special character: " << sc << std::endl;
assertUnreachable();
}
}
} else {
buffer += c;
}
break;
case XML_PARSE_STATE_PARSING_CLOSE:

View File

@ -17,11 +17,18 @@ namespace Dawn {
XML_PARSE_STATE_LOOKING_FOR_ATTRIBUTE_VALUE,
XML_PARSE_STATE_PARSING_ATTRIBUTE_VALUE,
XML_PARSE_STATE_PARSING_VALUE,
XML_PARSE_STATE_PARSING_CHILD,
XML_PARSE_STATE_PARSING_CLOSE,
XML_PARSE_STATE_PARSING_COMMENT
};
class Xml;
struct XmlNode;
// Discriminates what an XmlNode entry holds.
enum XmlNodeType {
  // Plain text content; the text is carried in XmlNode::value.
  XML_NODE_TYPE_TEXT = 0,
  // Nested element; the element is referenced through XmlNode::child.
  XML_NODE_TYPE_ELEMENT = 1
};
class Xml {
protected:
static bool_t isWhitespace(char_t c);
@ -31,10 +38,13 @@ namespace Dawn {
static void load(Xml *xml, std::string data, size_t *j);
std::string node;
std::string value;
std::string innerXml;
std::string outerXml;
std::string textContent;
std::map<std::string, std::string> attributes;
std::vector<struct XmlNode> childNodes;
// @deprecated
std::vector<Xml*> children;
std::vector<Xml*> getChildrenOfType(std::string type);
@ -42,4 +52,12 @@ namespace Dawn {
~Xml();
};
// A single entry in an Xml element's childNodes list. Depending on nodeType
// it represents either a run of text or a nested element.
struct XmlNode {
// Which kind of node this is (text or element).
enum XmlNodeType nodeType;
// Text carried by the node; the parser fills this in for text nodes.
std::string value;
// Nested element for element nodes, nullptr when there is none.
// NOTE(review): ownership is not visible here; presumably the parent Xml
// frees it via its (deprecated) children list - confirm against ~Xml().
Xml *child;
// Default constructor; defined in the implementation file.
XmlNode();
};
}