Charset parsing from argument string

2024-06-01 23:12:59 +02:00 · 2024-06-01 23:12:59 +02:00 · c27de5988d
parent d120bb3e05
commit c27de5988d
5 changed files with 267 additions and 184 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,15 +1,23 @@

-## Version 1.3 (forthcoming)
+## Version 1.3 (2024-06-01)

- Updated to MSDFgen 1.10
+- Updated to MSDFgen 1.12
 - Switched to vcpkg as the primary dependency management system
 - Removed Visual Studio solution and Makefile - now has to be generated by CMake
 - CMake configuration overhaul, added installation configuration
 - Switched to libpng as the primary PNG file encoder
+- Added uniform grid mode (`-uniformgrid`) where the atlas is laid out in a rectangular grid
+- Added options to add extra padding around glyphs (`-empadding`, `-pxpadding` and similar)
+- Added the possibility to specify asymmetrical distance range (`-aemrange`, `-apxrange`)
+- Added `-pxalign` option which governs glyph alignment with the pixel grid
+- Added `-allglyphs` option as an alternative to explicit charset / glyphset
+- Added `-chars` and `-glyphs` options to specify charset / glyphset directly on command line
 - Added `-varfont` option to configure variables of variable fonts
 - Added `-version` option to print program version
 - Arguments with double dash (e.g. `--font`) now also accepted
 - Minor fix to positioning for `-type hardmask`
+- Errors are now reported to `stderr`
+- TinyXML 2 no longer required as a dependency

 ### Version 1.2.2 (2021-09-06)

--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ This is a utility for generating compact font atlases using [MSDFgen](https://gi

 The atlas generator loads a subset of glyphs from a TTF or OTF font file, generates a distance field for each of them, and tightly packs them into an atlas bitmap (example below). The finished atlas and/or its layout metadata can be exported as an [Artery Font](https://github.com/Chlumsky/artery-font-format) file, a plain image file, a CSV sheet or a structured JSON file.

-![Atlas example](https://user-images.githubusercontent.com/18639794/76163889-811f2e80-614a-11ea-9b28-1eed54dbb899.png)
+![Atlas example](https://github.com/Chlumsky/msdf-atlas-gen/assets/18639794/ee8bfc77-7d36-4cbb-82df-aa8a02424b4a)

 A font atlas is typically stored in texture memory and used to draw text in real-time rendering contexts such as video games.

@@ -47,13 +47,16 @@ Use the following command line arguments for the standalone version of the atlas

 - `-font <fontfile.ttf/otf>` (required) &ndash; sets the input font file.
  - Alternatively, use `-varfont <fontfile.ttf/otf?var0=value0&var1=value1>` to configure a variable font.
- `-charset <charset.txt>` &ndash; sets the character set. The ASCII charset will be used if not specified. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`.
+- `-charset <charset.txt>` &ndash; sets the character set. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`.
 - `-glyphset <glyphset.txt>` &ndash; sets the set of input glyphs using their indices within the font file. See [the syntax specification](#glyph-set-specification).
+- `-chars <set string>` / `-glyphs <set string>` &ndash; sets the character set / glyph set in-line instead of loading it from a file. See [the syntax specification](#character-set-specification-syntax).
 - `-allglyphs` &ndash; sets the set of input glyphs to all glyphs present within the font file.
 - `-fontscale <scale>` &ndash; applies a scaling transformation to the font's glyphs. Mainly to be used to generate multiple sizes in a single atlas, otherwise use [`-size`](#glyph-configuration).
 - `-fontname <name>` &ndash; sets a name for the font that will be stored in certain output files as metadata.
 - `-and` &ndash; separates multiple inputs to be combined into a single atlas.

+If no character set or glyph set is provided, and `-allglyphs` is not used, the ASCII charset will be used.
+
 ### Bitmap atlas type

 `-type <type>` &ndash; see [Atlas types](#atlas-types)
--- a/msdf-atlas-gen/Charset.h
+++ b/msdf-atlas-gen/Charset.h
@@ -28,8 +28,10 @@ public:
    std::set<unicode_t>::const_iterator begin() const;
    std::set<unicode_t>::const_iterator end() const;

-    /// Load character set from a text file with the correct syntax
+    /// Load character set from a text file with compliant syntax
    bool load(const char *filename, bool disableCharLiterals = false);
+    /// Parse character set from a string with compliant syntax
+    bool parse(const char *str, size_t strLength, bool disableCharLiterals = false);

 private:
    std::set<unicode_t> codepoints;
--- a/msdf-atlas-gen/charset-parser.cpp
+++ b/msdf-atlas-gen/charset-parser.cpp
@@ -25,36 +25,6 @@ static char escapedChar(char c) {
    }
 }

-static int readWord(std::string &str, FILE *f) {
-    while (true) {
-        int c = fgetc(f);
-        if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_')
-            str.push_back((char) c);
-        else
-            return c;
-    }
-}
-
-static bool readString(std::string &str, FILE *f, char terminator) {
-    bool escape = false;
-    while (true) {
-        int c = fgetc(f);
-        if (c < 0)
-            return false;
-        if (escape) {
-            str.push_back(escapedChar((char) c));
-            escape = false;
-        } else {
-            if (c == terminator)
-                return true;
-            else if (c == '\\')
-                escape = true;
-            else
-                str.push_back((char) c);
-        }
-    }
-}
-
 static bool parseInt(int &i, const char *str) {
    i = 0;
    if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { // hex
@@ -84,6 +54,181 @@ static bool parseInt(int &i, const char *str) {
    return true;
 }

+template <int (READ_CHAR)(void *)> // READ_CHAR(userData) supplies the next input character
+static int readWord(void *userData, std::string &str) { // appends a run of [A-Za-z0-9_] characters to str
+    while (true) {
+        int c = READ_CHAR(userData);
+        if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_')
+            str.push_back((char) c);
+        else
+            return c; // the first character that is not part of the word is returned, not appended
+    }
+}
+
+template <int (READ_CHAR)(void *)> // READ_CHAR(userData) supplies the next input character
+static bool readString(void *userData, std::string &str, char terminator) { // appends characters to str up to an unescaped terminator
+    bool escape = false; // true while the previous character was an unescaped backslash
+    while (true) {
+        int c = READ_CHAR(userData);
+        if (c < 0)
+            return false; // input ended before the terminator was found
+        if (escape) {
+            str.push_back(escapedChar((char) c)); // the character following a backslash is translated by escapedChar
+            escape = false;
+        } else {
+            if (c == terminator)
+                return true; // the terminator is consumed but not stored
+            else if (c == '\\')
+                escape = true;
+            else
+                str.push_back((char) c);
+        }
+    }
+}
+
+template <int (READ_CHAR)(void *), void (ADD)(void *, unicode_t), bool (INCLUDE)(void *, const std::string &)>
+static bool charsetParse(void *userData, bool disableCharLiterals, bool disableInclude) {
+    // Parses a charset specification read via READ_CHAR, passing code points to ADD and @include paths to INCLUDE. Returns false on a syntax error. disableCharLiterals rejects '...' and "..." literals; disableInclude rejects @include.
+    enum {
+        CLEAR, // ready to accept a new element
+        TIGHT, // an element has just ended - whitespace or a separator must follow before the next one
+        RANGE_BRACKET, // after '['
+        RANGE_START, // after the first bound of a range
+        RANGE_SEPARATOR, // after the separator between the two bounds
+        RANGE_END // after the second bound, only ']' may follow
+    } state = CLEAR;
+
+    std::string buffer;
+    std::vector<unicode_t> unicodeBuffer;
+    unicode_t rangeStart = 0;
+    for (int c = READ_CHAR(userData), start = true; c >= 0; start = false) { // start is true only for the very first character
+        switch (c) {
+            case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
+                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
+                    return false;
+                buffer.push_back((char) c);
+                c = readWord<READ_CHAR>(userData, buffer); // c now holds the character following the number
+                {
+                    int cp;
+                    if (!parseInt(cp, buffer.c_str()))
+                        return false;
+                    switch (state) {
+                        case CLEAR:
+                            if (cp >= 0)
+                                ADD(userData, (unicode_t) cp);
+                            state = TIGHT;
+                            break;
+                        case RANGE_BRACKET:
+                            rangeStart = (unicode_t) cp;
+                            state = RANGE_START;
+                            break;
+                        case RANGE_SEPARATOR:
+                            for (unicode_t u = rangeStart; (int) u <= cp; ++u) // inclusive range; empty if the end precedes the start
+                                ADD(userData, u);
+                            state = RANGE_END;
+                            break;
+                        default:;
+                    }
+                }
+                buffer.clear();
+                continue; // next character already read
+            case '\'': // single UTF-8 character
+                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals)
+                    return false;
+                if (!readString<READ_CHAR>(userData, buffer, '\''))
+                    return false;
+                utf8Decode(unicodeBuffer, buffer.c_str());
+                if (unicodeBuffer.size() == 1) { // the literal must decode to exactly one code point
+                    switch (state) {
+                        case CLEAR:
+                            if (unicodeBuffer[0] > 0)
+                                ADD(userData, unicodeBuffer[0]);
+                            state = TIGHT;
+                            break;
+                        case RANGE_BRACKET:
+                            rangeStart = unicodeBuffer[0];
+                            state = RANGE_START;
+                            break;
+                        case RANGE_SEPARATOR:
+                            for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u)
+                                ADD(userData, u);
+                            state = RANGE_END;
+                            break;
+                        default:;
+                    }
+                } else
+                    return false;
+                unicodeBuffer.clear();
+                buffer.clear();
+                break;
+            case '"': // string of UTF-8 characters
+                if (state != CLEAR || disableCharLiterals)
+                    return false;
+                if (!readString<READ_CHAR>(userData, buffer, '"'))
+                    return false;
+                utf8Decode(unicodeBuffer, buffer.c_str());
+                for (unicode_t cp : unicodeBuffer)
+                    ADD(userData, cp);
+                unicodeBuffer.clear();
+                buffer.clear();
+                state = TIGHT;
+                break;
+            case '[': // character range start
+                if (state != CLEAR)
+                    return false;
+                state = RANGE_BRACKET;
+                break;
+            case ']': // character range end
+                if (state == RANGE_END)
+                    state = TIGHT;
+                else
+                    return false;
+                break;
+            case '@': // annotation
+                if (state != CLEAR)
+                    return false;
+                c = readWord<READ_CHAR>(userData, buffer);
+                if (buffer == "include" && !disableInclude) { // @include is a syntax error when includes are disabled
+                    while (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+                        c = READ_CHAR(userData);
+                    if (c != '"')
+                        return false;
+                    buffer.clear();
+                    if (!readString<READ_CHAR>(userData, buffer, '"'))
+                        return false;
+                    INCLUDE(userData, buffer); // NOTE(review): result is ignored, so a failed include does not fail the parse - confirm this is intended
+                    state = TIGHT;
+                } else // unknown annotation, or @include while includes are disabled
+                    return false;
+                buffer.clear();
+                break;
+            case ',': case ';': // separator
+                if (!(state == CLEAR || state == TIGHT)) {
+                    if (state == RANGE_START)
+                        state = RANGE_SEPARATOR;
+                    else
+                        return false;
+                } // else treat as whitespace
+                // fallthrough
+            case ' ': case '\n': case '\r': case '\t': // whitespace
+                if (state == TIGHT)
+                    state = CLEAR;
+                break;
+            case 0xef: // UTF-8 byte order mark
+                if (start) {
+                    if (!(READ_CHAR(userData) == 0xbb && READ_CHAR(userData) == 0xbf))
+                        return false;
+                    break;
+                } // fallthrough - a 0xef byte anywhere but at the very start is an unexpected character
+            default: // unexpected character
+                return false;
+        }
+        c = READ_CHAR(userData);
+    }
+
+    return state == CLEAR || state == TIGHT; // input must not end inside an unfinished range
+}
+
 static std::string combinePath(const char *basePath, const char *relPath) {
    if (relPath[0] == '/' || (relPath[0] && relPath[1] == ':')) // absolute path?
        return relPath;
@@ -96,156 +241,57 @@ static std::string combinePath(const char *basePath, const char *relPath) {
    return std::string(basePath, lastSlash+1)+relPath;
 }

-bool Charset::load(const char *filename, bool disableCharLiterals) {
+struct CharsetLoadData {
+    Charset *charset;
+    const char *filename;
+    bool disableCharLiterals;
+    FILE *file;

-    if (FILE *f = fopen(filename, "rb")) {
-
-        enum {
-            CLEAR,
-            TIGHT,
-            RANGE_BRACKET,
-            RANGE_START,
-            RANGE_SEPARATOR,
-            RANGE_END
-        } state = CLEAR;
-
-        std::string buffer;
-        std::vector<unicode_t> unicodeBuffer;
-        unicode_t rangeStart = 0;
-        for (int c = fgetc(f), start = true; c >= 0; start = false) {
-            switch (c) {
-                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
-                    if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
-                        goto FAIL;
-                    buffer.push_back((char) c);
-                    c = readWord(buffer, f);
-                    {
-                        int cp;
-                        if (!parseInt(cp, buffer.c_str()))
-                            goto FAIL;
-                        switch (state) {
-                            case CLEAR:
-                                if (cp >= 0)
-                                    add((unicode_t) cp);
-                                state = TIGHT;
-                                break;
-                            case RANGE_BRACKET:
-                                rangeStart = (unicode_t) cp;
-                                state = RANGE_START;
-                                break;
-                            case RANGE_SEPARATOR:
-                                for (unicode_t u = rangeStart; (int) u <= cp; ++u)
-                                    add(u);
-                                state = RANGE_END;
-                                break;
-                            default:;
-                        }
-                    }
-                    buffer.clear();
-                    continue; // next character already read
-                case '\'': // single UTF-8 character
-                    if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals)
-                        goto FAIL;
-                    if (!readString(buffer, f, '\''))
-                        goto FAIL;
-                    utf8Decode(unicodeBuffer, buffer.c_str());
-                    if (unicodeBuffer.size() == 1) {
-                        switch (state) {
-                            case CLEAR:
-                                if (unicodeBuffer[0] > 0)
-                                    add(unicodeBuffer[0]);
-                                state = TIGHT;
-                                break;
-                            case RANGE_BRACKET:
-                                rangeStart = unicodeBuffer[0];
-                                state = RANGE_START;
-                                break;
-                            case RANGE_SEPARATOR:
-                                for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u)
-                                    add(u);
-                                state = RANGE_END;
-                                break;
-                            default:;
-                        }
-                    } else
-                        goto FAIL;
-                    unicodeBuffer.clear();
-                    buffer.clear();
-                    break;
-                case '"': // string of UTF-8 characters
-                    if (state != CLEAR || disableCharLiterals)
-                        goto FAIL;
-                    if (!readString(buffer, f, '"'))
-                        goto FAIL;
-                    utf8Decode(unicodeBuffer, buffer.c_str());
-                    for (unicode_t cp : unicodeBuffer)
-                        add(cp);
-                    unicodeBuffer.clear();
-                    buffer.clear();
-                    state = TIGHT;
-                    break;
-                case '[': // character range start
-                    if (state != CLEAR)
-                        goto FAIL;
-                    state = RANGE_BRACKET;
-                    break;
-                case ']': // character range end
-                    if (state == RANGE_END)
-                        state = TIGHT;
-                    else
-                        goto FAIL;
-                    break;
-                case '@': // annotation
-                    if (state != CLEAR)
-                        goto FAIL;
-                    c = readWord(buffer, f);
-                    if (buffer == "include") {
-                        while (c == ' ' || c == '\t' || c == '\n' || c == '\r')
-                            c = fgetc(f);
-                        if (c != '"')
-                            goto FAIL;
-                        buffer.clear();
-                        if (!readString(buffer, f, '"'))
-                            goto FAIL;
-                        load(combinePath(filename, buffer.c_str()).c_str());
-                        state = TIGHT;
-                    } else
-                        goto FAIL;
-                    buffer.clear();
-                    break;
-                case ',': case ';': // separator
-                    if (!(state == CLEAR || state == TIGHT)) {
-                        if (state == RANGE_START)
-                            state = RANGE_SEPARATOR;
-                        else
-                            goto FAIL;
-                    } // else treat as whitespace
-                    // fallthrough
-                case ' ': case '\n': case '\r': case '\t': // whitespace
-                    if (state == TIGHT)
-                        state = CLEAR;
-                    break;
-                case 0xef: // UTF-8 byte order mark
-                    if (start) {
-                        if (!(fgetc(f) == 0xbb && fgetc(f) == 0xbf))
-                            goto FAIL;
-                        break;
-                    }
-                default: // unexpected character
-                    goto FAIL;
-            }
-            c = fgetc(f);
-        }
-
-        fclose(f);
-        return state == CLEAR || state == TIGHT;
-
-    FAIL:
-        fclose(f);
-        return false;
+    static int readChar(void *userData) {
+        return fgetc(reinterpret_cast<CharsetLoadData *>(userData)->file);
    }

+    static void add(void *userData, unicode_t cp) {
+        reinterpret_cast<CharsetLoadData *>(userData)->charset->add(cp);
+    }
+
+    static bool include(void *userData, const std::string &path) {
+        const CharsetLoadData &ud = *reinterpret_cast<CharsetLoadData *>(userData);
+        return ud.charset->load(combinePath(ud.filename, path.c_str()).c_str(), ud.disableCharLiterals);
+    }
+};
+
+bool Charset::load(const char *filename, bool disableCharLiterals) { // parses the charset specification in the given file into this charset
+    if (FILE *f = fopen(filename, "rb")) {
+        CharsetLoadData userData = { this, filename, disableCharLiterals, f }; // filename is kept so that @include paths can be combined with it
+        bool success = charsetParse<CharsetLoadData::readChar, CharsetLoadData::add, CharsetLoadData::include>(&userData, disableCharLiterals, false);
+        fclose(f);
+        return success;
+    }
    return false; // the file could not be opened
 }

+struct CharsetParseData { // callback state for running charsetParse over an in-memory string
+    Charset *charset; // receives the parsed code points
+    const char *cur, *end; // next unread byte and one past the last byte
+
+    static int readChar(void *userData) {
+        CharsetParseData &ud = *reinterpret_cast<CharsetParseData *>(userData);
+        return ud.cur < ud.end ? (int) (unsigned char) *ud.cur++ : -1; // bytes are returned as 0..255, -1 marks the end of input
+    }
+
+    static void add(void *userData, unicode_t cp) {
+        reinterpret_cast<CharsetParseData *>(userData)->charset->add(cp);
+    }
+
+    static bool include(void *, const std::string &) {
+        return false; // always reports failure - nothing is included for in-memory input
+    }
+};
+
+bool Charset::parse(const char *str, size_t strLength, bool disableCharLiterals) { // parses a charset specification from the strLength bytes starting at str
+    CharsetParseData userData = { this, str, str+strLength };
+    return charsetParse<CharsetParseData::readChar, CharsetParseData::add, CharsetParseData::include>(&userData, disableCharLiterals, true); // last argument is disableInclude
+}
+
 }
--- a/msdf-atlas-gen/main.cpp
+++ b/msdf-atlas-gen/main.cpp
@@ -69,6 +69,10 @@ R"(
      Specifies the input character set. Refer to the documentation for format of charset specification. Defaults to ASCII.
  -glyphset <filename>
      Specifies the set of input glyphs as glyph indices within the font file.
+  -chars <charset specification>
+      Specifies the input character set in-line. Refer to documentation for its syntax.
+  -glyphs <glyph set specification>
+      Specifies the set of glyph indices in-line. Refer to documentation for its syntax.
  -allglyphs
      Specifies that all glyphs within the font file are to be processed.
  -fontscale <scale>
@@ -290,6 +294,7 @@ struct FontInput {
    bool variableFont;
    GlyphIdentifierType glyphIdentifierType;
    const char *charsetFilename;
+    const char *charsetString;
    double fontScale;
    const char *fontName;
 };
@@ -487,16 +492,31 @@ int main(int argc, const char *const *argv) {
    #endif
        ARG_CASE("-charset", 1) {
            fontInput.charsetFilename = argv[argPos++];
+            fontInput.charsetString = nullptr;
            fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT;
            continue;
        }
        ARG_CASE("-glyphset", 1) {
            fontInput.charsetFilename = argv[argPos++];
+            fontInput.charsetString = nullptr;
+            fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
+            continue;
+        }
+        ARG_CASE("-chars", 1) {
+            fontInput.charsetFilename = nullptr;
+            fontInput.charsetString = argv[argPos++];
+            fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT;
+            continue;
+        }
+        ARG_CASE("-glyphs", 1) {
+            fontInput.charsetFilename = nullptr;
+            fontInput.charsetString = argv[argPos++];
            fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
            continue;
        }
        ARG_CASE("-allglyphs", 0) {
            fontInput.charsetFilename = nullptr;
+            fontInput.charsetString = nullptr;
            fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
            continue;
        }
@@ -512,7 +532,7 @@ int main(int argc, const char *const *argv) {
            continue;
        }
        ARG_CASE("-and", 0) {
-            if (!fontInput.fontFilename && !fontInput.charsetFilename && fontInput.fontScale < 0)
+            if (!fontInput.fontFilename && !fontInput.charsetFilename && !fontInput.charsetString && fontInput.fontScale < 0)
                ABORT("No font, character set, or font scale specified before -and separator.");
            if (!fontInputs.empty() && !memcmp(&fontInputs.back(), &fontInput, sizeof(FontInput)))
                ABORT("No changes between subsequent inputs. A different font, character set, or font scale must be set inbetween -and separators.");
@@ -926,8 +946,9 @@ int main(int argc, const char *const *argv) {
    for (std::vector<FontInput>::reverse_iterator it = fontInputs.rbegin(); it != fontInputs.rend(); ++it) {
        if (!it->fontFilename && nextFontInput->fontFilename)
            it->fontFilename = nextFontInput->fontFilename;
-        if (!it->charsetFilename && nextFontInput->charsetFilename) {
+        if (!(it->charsetFilename || it->charsetString || it->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX) && (nextFontInput->charsetFilename || nextFontInput->charsetString || nextFontInput->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX)) {
            it->charsetFilename = nextFontInput->charsetFilename;
+            it->charsetString = nextFontInput->charsetString;
            it->glyphIdentifierType = nextFontInput->glyphIdentifierType;
        }
        if (it->fontScale < 0 && nextFontInput->fontScale >= 0)
@@ -1100,6 +1121,9 @@ int main(int argc, const char *const *argv) {
            if (fontInput.charsetFilename) {
                if (!charset.load(fontInput.charsetFilename, fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT))
                    ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to load glyph set specification." : "Failed to load character set specification.");
+            } else if (fontInput.charsetString) {
+                if (!charset.parse(fontInput.charsetString, strlen(fontInput.charsetString), fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT))
+                    ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to parse glyph set specification." : "Failed to parse character set specification.");
            } else if (fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX)
                msdfgen::getGlyphCount(allGlyphCount, font);
            else
@@ -1411,7 +1435,7 @@ int main(int argc, const char *const *argv) {
            }
        } else {
            result = 1;
-            fputs("Shadron preview not supported in -glyphset mode.\n", stderr);
+            fputs("Shadron preview not supported in glyph set mode.\n", stderr);
        }
    }