From c27de5988d7ecfbc9936ee5f936429e2dbc077b9 Mon Sep 17 00:00:00 2001 From: Chlumsky Date: Sat, 1 Jun 2024 23:12:59 +0200 Subject: [PATCH] Charset parsing from argument string --- CHANGELOG.md | 12 +- README.md | 7 +- msdf-atlas-gen/Charset.h | 4 +- msdf-atlas-gen/charset-parser.cpp | 398 +++++++++++++++++------------- msdf-atlas-gen/main.cpp | 30 ++- 5 files changed, 267 insertions(+), 184 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8272677..9670d81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,23 @@ -## Version 1.3 (forthcoming) +## Version 1.3 (2024-06-01) -- Updated to MSDFgen 1.10 +- Updated to MSDFgen 1.12 - Switched to vcpkg as the primary dependency management system - Removed Visual Studio solution and Makefile - now has to be generated by CMake - CMake configuration overhaul, added installation configuration - Switched to libpng as the primary PNG file encoder +- Added uniform grid mode (`-uniformgrid`) where atlas is laid out in a rectangular grid +- Added options to add extra padding around glyphs (`-empadding`, `-pxpadding` and similar) +- Added the possibility to specify asymmetrical distance range (`-aemrange`, `-apxrange`) +- Added `-pxalign` option which governs glyph alignment with the pixel grid +- Added `-allglyphs` option as alternative to explicit charset / glyphset +- Added `-chars` and `-glyphs` options to specify charset / glyphset directly on command line - Added `-varfont` option to configure variables of variable fonts - Added `-version` option to print program version - Arguments with double dash (e.g. 
`--font`) now also accepted - Minor fix to positioning for `-type hardmask` +- Errors are now reported to `stderr` +- TinyXML 2 no longer required as a dependency ### Version 1.2.2 (2021-09-06) diff --git a/README.md b/README.md index d3b9391..a494eb0 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This is a utility for generating compact font atlases using [MSDFgen](https://gi The atlas generator loads a subset of glyphs from a TTF or OTF font file, generates a distance field for each of them, and tightly packs them into an atlas bitmap (example below). The finished atlas and/or its layout metadata can be exported as an [Artery Font](https://github.com/Chlumsky/artery-font-format) file, a plain image file, a CSV sheet or a structured JSON file. -![Atlas example](https://user-images.githubusercontent.com/18639794/76163889-811f2e80-614a-11ea-9b28-1eed54dbb899.png) +![Atlas example](https://github.com/Chlumsky/msdf-atlas-gen/assets/18639794/ee8bfc77-7d36-4cbb-82df-aa8a02424b4a) A font atlas is typically stored in texture memory and used to draw text in real-time rendering contexts such as video games. @@ -47,13 +47,16 @@ Use the following command line arguments for the standalone version of the atlas - `-font ` (required) – sets the input font file. - Alternatively, use `-varfont ` to configure a variable font. -- `-charset ` – sets the character set. The ASCII charset will be used if not specified. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`. +- `-charset ` – sets the character set. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`. - `-glyphset ` – sets the set of input glyphs using their indices within the font file. See [the syntax specification](#glyph-set-specification). +- `-chars` / `-glyphs ` sets the above character / glyph set in-line. See [the syntax specification](#character-set-specification-syntax). 
- `-allglyphs` – sets the set of input glyphs to all glyphs present within the font file. - `-fontscale ` – applies a scaling transformation to the font's glyphs. Mainly to be used to generate multiple sizes in a single atlas, otherwise use [`-size`](#glyph-configuration). - `-fontname ` – sets a name for the font that will be stored in certain output files as metadata. - `-and` – separates multiple inputs to be combined into a single atlas. +If no character set or glyph set is provided, and `-allglyphs` is not used, the ASCII charset will be used. + ### Bitmap atlas type `-type ` – see [Atlas types](#atlas-types) diff --git a/msdf-atlas-gen/Charset.h b/msdf-atlas-gen/Charset.h index 56e011a..4589b7c 100644 --- a/msdf-atlas-gen/Charset.h +++ b/msdf-atlas-gen/Charset.h @@ -28,8 +28,10 @@ public: std::set::const_iterator begin() const; std::set::const_iterator end() const; - /// Load character set from a text file with the correct syntax + /// Load character set from a text file with compliant syntax bool load(const char *filename, bool disableCharLiterals = false); + /// Parse character set from a string with compliant syntax + bool parse(const char *str, size_t strLength, bool disableCharLiterals = false); private: std::set codepoints; diff --git a/msdf-atlas-gen/charset-parser.cpp b/msdf-atlas-gen/charset-parser.cpp index c74f68f..616129f 100644 --- a/msdf-atlas-gen/charset-parser.cpp +++ b/msdf-atlas-gen/charset-parser.cpp @@ -25,36 +25,6 @@ static char escapedChar(char c) { } } -static int readWord(std::string &str, FILE *f) { - while (true) { - int c = fgetc(f); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_') - str.push_back((char) c); - else - return c; - } -} - -static bool readString(std::string &str, FILE *f, char terminator) { - bool escape = false; - while (true) { - int c = fgetc(f); - if (c < 0) - return false; - if (escape) { - str.push_back(escapedChar((char) c)); - escape = false; - } else { - if (c == 
terminator)
-                return true;
-            else if (c == '\\')
-                escape = true;
-            else
-                str.push_back((char) c);
-        }
-    }
-}
-
 static bool parseInt(int &i, const char *str) {
     i = 0;
     if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { // hex
@@ -84,6 +54,191 @@ static bool parseInt(int &i, const char *str) {
     return true;
 }
 
+/// Appends characters obtained from READ_CHAR to str for as long as they are ASCII letters, digits, or '_'.
+/// Returns the first character that did not match (it has already been consumed from the input).
+template <int READ_CHAR(void *)>
+static int readWord(void *userData, std::string &str) {
+    while (true) {
+        int c = READ_CHAR(userData);
+        if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_')
+            str.push_back((char) c);
+        else
+            return c;
+    }
+}
+
+/// Appends characters obtained from READ_CHAR to str until an unescaped terminator is read; a character following a backslash is passed through escapedChar.
+/// Returns true once the terminator is found (it is not appended), false if a negative value is read first.
+template <int READ_CHAR(void *)>
+static bool readString(void *userData, std::string &str, char terminator) {
+    bool escape = false;
+    while (true) {
+        int c = READ_CHAR(userData);
+        if (c < 0)
+            return false;
+        if (escape) {
+            str.push_back(escapedChar((char) c));
+            escape = false;
+        } else {
+            if (c == terminator)
+                return true;
+            else if (c == '\\')
+                escape = true;
+            else
+                str.push_back((char) c);
+        }
+    }
+}
+
+/// Parses a charset specification read one character at a time through READ_CHAR until it returns a negative value.
+/// Each parsed code point is passed to ADD and the path of each @include annotation is passed to INCLUDE.
+/// Character literals ('x' and "xyz") are rejected if disableCharLiterals is set, @include is rejected if disableInclude is set.
+/// Returns false on a syntax error, on a rejected construct, if INCLUDE reports failure, or if the input ends inside an unfinished range.
+template <int READ_CHAR(void *), void ADD(void *, unicode_t), bool INCLUDE(void *, const std::string &)>
+static bool charsetParse(void *userData, bool disableCharLiterals, bool disableInclude) {
+
+    enum {
+        CLEAR,
+        TIGHT,
+        RANGE_BRACKET,
+        RANGE_START,
+        RANGE_SEPARATOR,
+        RANGE_END
+    } state = CLEAR;
+
+    std::string buffer;
+    std::vector<unicode_t> unicodeBuffer;
+    unicode_t rangeStart = 0;
+    for (int c = READ_CHAR(userData), start = true; c >= 0; start = false) {
+        switch (c) {
+            case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
+                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
+                    return false;
+                buffer.push_back((char) c);
+                c = readWord<READ_CHAR>(userData, buffer);
+                {
+                    int cp;
+                    if (!parseInt(cp, buffer.c_str()))
+                        return false;
+                    switch (state) {
+                        case CLEAR:
+                            if (cp >= 0)
+                                ADD(userData, (unicode_t) cp);
+                            state = TIGHT;
+                            break;
+                        case RANGE_BRACKET:
+                            rangeStart = (unicode_t) cp;
+                            state = RANGE_START;
+                            break;
+                        case RANGE_SEPARATOR:
+                            for (unicode_t u = rangeStart; (int) u <= cp; ++u)
+                                ADD(userData, u);
+                            state = RANGE_END;
+                            break;
+                        default:;
+                    }
+                }
+                buffer.clear();
+                continue; // next character already read
+            case '\'': // single UTF-8 character
+                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals)
+                    return false;
+                if (!readString<READ_CHAR>(userData, buffer, '\''))
+                    return false;
+                utf8Decode(unicodeBuffer, buffer.c_str());
+                if (unicodeBuffer.size() == 1) {
+                    switch (state) {
+                        case CLEAR:
+                            if (unicodeBuffer[0] > 0)
+                                ADD(userData, unicodeBuffer[0]);
+                            state = TIGHT;
+                            break;
+                        case RANGE_BRACKET:
+                            rangeStart = unicodeBuffer[0];
+                            state = RANGE_START;
+                            break;
+                        case RANGE_SEPARATOR:
+                            for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u)
+                                ADD(userData, u);
+                            state = RANGE_END;
+                            break;
+                        default:;
+                    }
+                } else
+                    return false;
+                unicodeBuffer.clear();
+                buffer.clear();
+                break;
+            case '"': // string of UTF-8 characters
+                if (state != CLEAR || disableCharLiterals)
+                    return false;
+                if (!readString<READ_CHAR>(userData, buffer, '"'))
+                    return false;
+                utf8Decode(unicodeBuffer, buffer.c_str());
+                for (unicode_t cp : unicodeBuffer)
+                    ADD(userData, cp);
+                unicodeBuffer.clear();
+                buffer.clear();
+                state = TIGHT;
+                break;
+            case '[': // character range start
+                if (state != CLEAR)
+                    return false;
+                state = RANGE_BRACKET;
+                break;
+            case ']': // character range end
+                if (state == RANGE_END)
+                    state = TIGHT;
+                else
+                    return false;
+                break;
+            case '@': // annotation
+                if (state != CLEAR)
+                    return false;
+                c = readWord<READ_CHAR>(userData, buffer);
+                if (buffer == "include") {
+                    while (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+                        c = READ_CHAR(userData);
+                    if (c != '"')
+                        return false;
+                    buffer.clear();
+                    if (!readString<READ_CHAR>(userData, buffer, '"'))
+                        return false;
+                    // Reject the annotation where includes are disabled and propagate a failure reported by INCLUDE
+                    if (disableInclude || !INCLUDE(userData, buffer))
+                        return false;
+                    state = TIGHT;
+                } else
+                    return false;
+                buffer.clear();
+                break;
+            case ',': case ';': // separator
+                if (!(state == CLEAR || state == TIGHT)) {
+                    if (state == RANGE_START)
+                        state = RANGE_SEPARATOR;
+                    else
+                        return false;
+                } // else treat as whitespace
+                // fallthrough
+            case ' ': case '\n': case '\r': case '\t': // whitespace
+                if (state == TIGHT)
+                    state = CLEAR;
+                break;
+            case 0xef: // UTF-8 byte order mark
+                if (start) {
+                    if (!(READ_CHAR(userData) == 0xbb && READ_CHAR(userData) == 0xbf))
+                        return false;
+                    break;
+                }
+            default: // unexpected character
+                return false;
+        }
+        c = READ_CHAR(userData);
+    }
+
+    return state == CLEAR || state == TIGHT;
+}
+
 static std::string combinePath(const char *basePath, const char *relPath) {
     if (relPath[0] == '/' || (relPath[0] && relPath[1] == ':')) // absolute path?
         return relPath;
@@ -96,156 +251,57 @@ static std::string combinePath(const char *basePath, const char *relPath) {
     return std::string(basePath, lastSlash+1)+relPath;
 }
 
-bool Charset::load(const char *filename, bool disableCharLiterals) {
+struct CharsetLoadData {
+    Charset *charset;
+    const char *filename;
+    bool disableCharLiterals;
+    FILE *file;
 
-    if (FILE *f = fopen(filename, "rb")) {
-
-        enum {
-            CLEAR,
-            TIGHT,
-            RANGE_BRACKET,
-            RANGE_START,
-            RANGE_SEPARATOR,
-            RANGE_END
-        } state = CLEAR;
-
-        std::string buffer;
-        std::vector<unicode_t> unicodeBuffer;
-        unicode_t rangeStart = 0;
-        for (int c = fgetc(f), start = true; c >= 0; start = false) {
-            switch (c) {
-                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
-                    if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
-                        goto FAIL;
-                    buffer.push_back((char) c);
-                    c = readWord(buffer, f);
-                    {
-                        int cp;
-                        if (!parseInt(cp, buffer.c_str()))
-                            goto FAIL;
-                        switch (state) {
-                            case CLEAR:
-                                if (cp >= 0)
-                                    add((unicode_t) cp);
-                                state = TIGHT;
-                                break;
-                            case RANGE_BRACKET:
-                                rangeStart = (unicode_t) cp;
-                                state = RANGE_START;
-                                break;
-                            case RANGE_SEPARATOR:
-                                for (unicode_t u = rangeStart; (int) u <= cp; ++u)
-                                    add(u);
-                                state = RANGE_END;
-                                break;
-                            default:;
-                        }
-                    }
-                    buffer.clear();
-                    continue; // next character already read
-                case '\'': // single UTF-8 character
-                    if (!(state == CLEAR || state == 
RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals) - goto FAIL; - if (!readString(buffer, f, '\'')) - goto FAIL; - utf8Decode(unicodeBuffer, buffer.c_str()); - if (unicodeBuffer.size() == 1) { - switch (state) { - case CLEAR: - if (unicodeBuffer[0] > 0) - add(unicodeBuffer[0]); - state = TIGHT; - break; - case RANGE_BRACKET: - rangeStart = unicodeBuffer[0]; - state = RANGE_START; - break; - case RANGE_SEPARATOR: - for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u) - add(u); - state = RANGE_END; - break; - default:; - } - } else - goto FAIL; - unicodeBuffer.clear(); - buffer.clear(); - break; - case '"': // string of UTF-8 characters - if (state != CLEAR || disableCharLiterals) - goto FAIL; - if (!readString(buffer, f, '"')) - goto FAIL; - utf8Decode(unicodeBuffer, buffer.c_str()); - for (unicode_t cp : unicodeBuffer) - add(cp); - unicodeBuffer.clear(); - buffer.clear(); - state = TIGHT; - break; - case '[': // character range start - if (state != CLEAR) - goto FAIL; - state = RANGE_BRACKET; - break; - case ']': // character range end - if (state == RANGE_END) - state = TIGHT; - else - goto FAIL; - break; - case '@': // annotation - if (state != CLEAR) - goto FAIL; - c = readWord(buffer, f); - if (buffer == "include") { - while (c == ' ' || c == '\t' || c == '\n' || c == '\r') - c = fgetc(f); - if (c != '"') - goto FAIL; - buffer.clear(); - if (!readString(buffer, f, '"')) - goto FAIL; - load(combinePath(filename, buffer.c_str()).c_str()); - state = TIGHT; - } else - goto FAIL; - buffer.clear(); - break; - case ',': case ';': // separator - if (!(state == CLEAR || state == TIGHT)) { - if (state == RANGE_START) - state = RANGE_SEPARATOR; - else - goto FAIL; - } // else treat as whitespace - // fallthrough - case ' ': case '\n': case '\r': case '\t': // whitespace - if (state == TIGHT) - state = CLEAR; - break; - case 0xef: // UTF-8 byte order mark - if (start) { - if (!(fgetc(f) == 0xbb && fgetc(f) == 0xbf)) - goto FAIL; - break; - } - 
default: // unexpected character
-                    goto FAIL;
-            }
-            c = fgetc(f);
-        }
-
-        fclose(f);
-        return state == CLEAR || state == TIGHT;
-
-    FAIL:
-        fclose(f);
-        return false;
+    static int readChar(void *userData) { // READ_CHAR callback - returns the fgetc result for the open file
+        return fgetc(reinterpret_cast<CharsetLoadData *>(userData)->file);
     }
 
+    static void add(void *userData, unicode_t cp) { // ADD callback - adds cp to the target charset
+        reinterpret_cast<CharsetLoadData *>(userData)->charset->add(cp);
+    }
+
+    static bool include(void *userData, const std::string &path) { // INCLUDE callback - loads path combined with this file's name by combinePath
+        const CharsetLoadData &ud = *reinterpret_cast<const CharsetLoadData *>(userData);
+        return ud.charset->load(combinePath(ud.filename, path.c_str()).c_str(), ud.disableCharLiterals);
+    }
+};
+
+bool Charset::load(const char *filename, bool disableCharLiterals) { // returns false if the file cannot be opened or charsetParse fails
+    if (FILE *f = fopen(filename, "rb")) {
+        CharsetLoadData userData = { this, filename, disableCharLiterals, f };
+        bool success = charsetParse<CharsetLoadData::readChar, CharsetLoadData::add, CharsetLoadData::include>(&userData, disableCharLiterals, false);
+        fclose(f);
+        return success;
+    }
     return false;
 }
 
+struct CharsetParseData { // callbacks and cursor for parsing the in-memory byte range [cur, end)
+    Charset *charset;
+    const char *cur, *end;
+
+    static int readChar(void *userData) { // READ_CHAR callback - next byte as a non-negative value, -1 once cur reaches end
+        CharsetParseData &ud = *reinterpret_cast<CharsetParseData *>(userData);
+        return ud.cur < ud.end ? (int) (unsigned char) *ud.cur++ : -1;
+    }
+
+    static void add(void *userData, unicode_t cp) { // ADD callback - adds cp to the target charset
+        reinterpret_cast<CharsetParseData *>(userData)->charset->add(cp);
+    }
+
+    static bool include(void *, const std::string &) { // INCLUDE callback - always reports failure
+        return false;
+    }
+};
+
+bool Charset::parse(const char *str, size_t strLength, bool disableCharLiterals) { // parses strLength bytes at str, passing disableInclude = true
+    CharsetParseData userData = { this, str, str+strLength };
+    return charsetParse<CharsetParseData::readChar, CharsetParseData::add, CharsetParseData::include>(&userData, disableCharLiterals, true);
+}
+
 }
diff --git a/msdf-atlas-gen/main.cpp b/msdf-atlas-gen/main.cpp
index 7a76058..66d68b1 100644
--- a/msdf-atlas-gen/main.cpp
+++ b/msdf-atlas-gen/main.cpp
@@ -69,6 +69,10 @@ R"( Specifies the input character set. Refer to the documentation for format of charset specification. Defaults to ASCII. -glyphset Specifies the set of input glyphs as glyph indices within the font file. + -chars + Specifies the input character set in-line. 
Refer to documentation for its syntax. + -glyphs + Specifies the set of glyph indices in-line. Refer to documentation for its syntax. -allglyphs Specifies that all glyphs within the font file are to be processed. -fontscale @@ -290,6 +294,7 @@ struct FontInput { bool variableFont; GlyphIdentifierType glyphIdentifierType; const char *charsetFilename; + const char *charsetString; double fontScale; const char *fontName; }; @@ -487,16 +492,31 @@ int main(int argc, const char *const *argv) { #endif ARG_CASE("-charset", 1) { fontInput.charsetFilename = argv[argPos++]; + fontInput.charsetString = nullptr; fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT; continue; } ARG_CASE("-glyphset", 1) { fontInput.charsetFilename = argv[argPos++]; + fontInput.charsetString = nullptr; + fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX; + continue; + } + ARG_CASE("-chars", 1) { + fontInput.charsetFilename = nullptr; + fontInput.charsetString = argv[argPos++]; + fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT; + continue; + } + ARG_CASE("-glyphs", 1) { + fontInput.charsetFilename = nullptr; + fontInput.charsetString = argv[argPos++]; fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX; continue; } ARG_CASE("-allglyphs", 0) { fontInput.charsetFilename = nullptr; + fontInput.charsetString = nullptr; fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX; continue; } @@ -512,7 +532,7 @@ int main(int argc, const char *const *argv) { continue; } ARG_CASE("-and", 0) { - if (!fontInput.fontFilename && !fontInput.charsetFilename && fontInput.fontScale < 0) + if (!fontInput.fontFilename && !fontInput.charsetFilename && !fontInput.charsetString && fontInput.fontScale < 0) ABORT("No font, character set, or font scale specified before -and separator."); if (!fontInputs.empty() && !memcmp(&fontInputs.back(), &fontInput, sizeof(FontInput))) ABORT("No changes between subsequent inputs. 
A different font, character set, or font scale must be set inbetween -and separators."); @@ -926,8 +946,9 @@ int main(int argc, const char *const *argv) { for (std::vector::reverse_iterator it = fontInputs.rbegin(); it != fontInputs.rend(); ++it) { if (!it->fontFilename && nextFontInput->fontFilename) it->fontFilename = nextFontInput->fontFilename; - if (!it->charsetFilename && nextFontInput->charsetFilename) { + if (!(it->charsetFilename || it->charsetString || it->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX) && (nextFontInput->charsetFilename || nextFontInput->charsetString || nextFontInput->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX)) { it->charsetFilename = nextFontInput->charsetFilename; + it->charsetString = nextFontInput->charsetString; it->glyphIdentifierType = nextFontInput->glyphIdentifierType; } if (it->fontScale < 0 && nextFontInput->fontScale >= 0) @@ -1100,6 +1121,9 @@ int main(int argc, const char *const *argv) { if (fontInput.charsetFilename) { if (!charset.load(fontInput.charsetFilename, fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT)) ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to load glyph set specification." : "Failed to load character set specification."); + } else if (fontInput.charsetString) { + if (!charset.parse(fontInput.charsetString, strlen(fontInput.charsetString), fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT)) + ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to parse glyph set specification." 
: "Failed to parse character set specification."); } else if (fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX) msdfgen::getGlyphCount(allGlyphCount, font); else @@ -1411,7 +1435,7 @@ int main(int argc, const char *const *argv) { } } else { result = 1; - fputs("Shadron preview not supported in -glyphset mode.\n", stderr); + fputs("Shadron preview not supported in glyph set mode.\n", stderr); } }