Charset parsing from argument string

This commit is contained in:
Chlumsky 2024-06-01 23:12:59 +02:00
parent d120bb3e05
commit c27de5988d
5 changed files with 267 additions and 184 deletions

View File

@ -1,15 +1,23 @@
## Version 1.3 (forthcoming)
## Version 1.3 (2024-06-01)
- Updated to MSDFgen 1.10
- Updated to MSDFgen 1.12
- Switched to vcpkg as the primary dependency management system
- Removed Visual Studio solution and Makefile - now has to be generated by CMake
- CMake configuration overhaul, added installation configuration
- Switched to libpng as the primary PNG file encoder
- Added uniform grid mode (`-uniformgrid`) where atlas is laid out in a rectangular grid
- Added options to add extra padding around glyphs (`-empadding`, `-pxpadding` and similar)
- Added the possibility to specify asymmetrical distance range (`-aemrange`, `-apxrange`)
- Added `-pxalign` option which governs glyph alignment with the pixel grid
- Added `-allglyphs` option as alternative to explicit charset / glyphset
- Added `-chars` and `-glyphs` options to specify charset / glyphset directly on command line
- Added `-varfont` option to configure variables of variable fonts
- Added `-version` option to print program version
- Arguments with double dash (e.g. `--font`) now also accepted
- Minor fix to positioning for `-type hardmask`
- Errors are now reported to `stderr`
- TinyXML 2 no longer required as a dependency
### Version 1.2.2 (2021-09-06)

View File

@ -5,7 +5,7 @@ This is a utility for generating compact font atlases using [MSDFgen](https://gi
The atlas generator loads a subset of glyphs from a TTF or OTF font file, generates a distance field for each of them, and tightly packs them into an atlas bitmap (example below). The finished atlas and/or its layout metadata can be exported as an [Artery Font](https://github.com/Chlumsky/artery-font-format) file, a plain image file, a CSV sheet or a structured JSON file.
![Atlas example](https://user-images.githubusercontent.com/18639794/76163889-811f2e80-614a-11ea-9b28-1eed54dbb899.png)
![Atlas example](https://github.com/Chlumsky/msdf-atlas-gen/assets/18639794/ee8bfc77-7d36-4cbb-82df-aa8a02424b4a)
A font atlas is typically stored in texture memory and used to draw text in real-time rendering contexts such as video games.
@ -47,13 +47,16 @@ Use the following command line arguments for the standalone version of the atlas
- `-font <fontfile.ttf/otf>` (required) &ndash; sets the input font file.
- Alternatively, use `-varfont <fontfile.ttf/otf?var0=value0&var1=value1>` to configure a variable font.
- `-charset <charset.txt>` &ndash; sets the character set. The ASCII charset will be used if not specified. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`.
- `-charset <charset.txt>` &ndash; sets the character set. See [the syntax specification](#character-set-specification-syntax) of `charset.txt`.
- `-glyphset <glyphset.txt>` &ndash; sets the set of input glyphs using their indices within the font file. See [the syntax specification](#glyph-set-specification).
- `-chars <set string>` / `-glyphs <set string>` &ndash; sets the above character set / glyph set in-line (directly on the command line) instead of loading it from a file. See [the syntax specification](#character-set-specification-syntax).
- `-allglyphs` &ndash; sets the set of input glyphs to all glyphs present within the font file.
- `-fontscale <scale>` &ndash; applies a scaling transformation to the font's glyphs. Mainly to be used to generate multiple sizes in a single atlas, otherwise use [`-size`](#glyph-configuration).
- `-fontname <name>` &ndash; sets a name for the font that will be stored in certain output files as metadata.
- `-and` &ndash; separates multiple inputs to be combined into a single atlas.
If no character set or glyph set is provided, and `-allglyphs` is not used, the ASCII charset will be used.
### Bitmap atlas type
`-type <type>` &ndash; see [Atlas types](#atlas-types)

View File

@ -28,8 +28,10 @@ public:
std::set<unicode_t>::const_iterator begin() const;
std::set<unicode_t>::const_iterator end() const;
/// Load character set from a text file with the correct syntax
/// Load character set from a text file with compliant syntax
bool load(const char *filename, bool disableCharLiterals = false);
/// Parse character set from a string with compliant syntax
bool parse(const char *str, size_t strLength, bool disableCharLiterals = false);
private:
std::set<unicode_t> codepoints;

View File

@ -25,36 +25,6 @@ static char escapedChar(char c) {
}
}
/// Reads a word (ASCII letters, digits and underscores) from file f and appends it to str.
/// Returns the first value obtained from fgetc that is not part of the word;
/// that value has already been consumed from the stream.
static int readWord(std::string &str, FILE *f) {
    for (;;) {
        const int ch = fgetc(f);
        const bool isLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
        const bool isDigit = ch >= '0' && ch <= '9';
        if (!(isLetter || isDigit || ch == '_'))
            return ch;
        str.push_back((char) ch);
    }
}
/// Reads characters from file f and appends them to str until an unescaped terminator is found.
/// A backslash causes the following character to be translated by escapedChar instead of
/// being interpreted (so an escaped terminator does not end the string).
/// Returns true once the terminator is reached, false if the input ends first.
static bool readString(std::string &str, FILE *f, char terminator) {
    bool pendingEscape = false;
    for (int ch = fgetc(f); ch >= 0; ch = fgetc(f)) {
        if (pendingEscape) {
            // Character following a backslash
            str.push_back(escapedChar((char) ch));
            pendingEscape = false;
            continue;
        }
        if (ch == terminator)
            return true;
        if (ch == '\\')
            pendingEscape = true;
        else
            str.push_back((char) ch);
    }
    // Input ended before the terminator
    return false;
}
static bool parseInt(int &i, const char *str) {
i = 0;
if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { // hex
@ -84,6 +54,181 @@ static bool parseInt(int &i, const char *str) {
return true;
}
template <int (READ_CHAR)(void *)>
static int readWord(void *userData, std::string &str) {
while (true) {
int c = READ_CHAR(userData);
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_')
str.push_back((char) c);
else
return c;
}
}
/// Reads characters using READ_CHAR(userData) and appends them to str until an unescaped terminator is found.
/// A backslash causes the following character to be translated by escapedChar instead of
/// being interpreted (so an escaped terminator does not end the string).
/// Returns true once the terminator is reached, false if READ_CHAR reports a negative value first.
template <int (READ_CHAR)(void *)>
static bool readString(void *userData, std::string &str, char terminator) {
    bool pendingEscape = false;
    for (int ch = READ_CHAR(userData); ch >= 0; ch = READ_CHAR(userData)) {
        if (pendingEscape) {
            // Character following a backslash
            str.push_back(escapedChar((char) ch));
            pendingEscape = false;
            continue;
        }
        if (ch == terminator)
            return true;
        if (ch == '\\')
            pendingEscape = true;
        else
            str.push_back((char) ch);
    }
    // Input ended before the terminator
    return false;
}
/// Parses a character set / glyph set specification from an abstract character source.
///
/// Template parameters:
///   READ_CHAR(userData) - returns the next input byte as a non-negative value, or a negative value at the end of input
///   ADD(userData, cp) - receives each parsed value
///   INCLUDE(userData, path) - invoked for each @include "path" annotation; its result is not checked
///
/// Parameters:
///   userData - opaque pointer passed through to the callbacks
///   disableCharLiterals - if true, 'c' and "string" literals are treated as syntax errors
///   disableInclude - if true, the @include annotation is treated as a syntax error
///
/// Returns true if the whole input was consumed without a syntax error and did not end inside a range.
/// Values added before an error was encountered are not rolled back.
template <int (READ_CHAR)(void *), void (ADD)(void *, unicode_t), bool (INCLUDE)(void *, const std::string &)>
static bool charsetParse(void *userData, bool disableCharLiterals, bool disableInclude) {
    enum {
        CLEAR, // a new element may start here
        TIGHT, // an element has just ended - whitespace or a separator must come before the next one
        RANGE_BRACKET, // after '[', expecting the first bound
        RANGE_START, // after the first bound, expecting a separator
        RANGE_SEPARATOR, // after the separator, expecting the second bound
        RANGE_END // after the second bound, expecting ']'
    } state = CLEAR;

    std::string buffer;
    std::vector<unicode_t> unicodeBuffer;
    unicode_t rangeStart = 0;
    // start is only true while the very first character is being processed (for byte order mark detection)
    for (int c = READ_CHAR(userData), start = true; c >= 0; start = false) {
        switch (c) {
            case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
                    return false;
                buffer.push_back((char) c);
                // readWord consumes the character that follows the number and returns it
                c = readWord<READ_CHAR>(userData, buffer);
                {
                    int cp;
                    if (!parseInt(cp, buffer.c_str()))
                        return false;
                    switch (state) {
                        case CLEAR:
                            if (cp >= 0)
                                ADD(userData, (unicode_t) cp);
                            state = TIGHT;
                            break;
                        case RANGE_BRACKET:
                            rangeStart = (unicode_t) cp;
                            state = RANGE_START;
                            break;
                        case RANGE_SEPARATOR:
                            for (unicode_t u = rangeStart; (int) u <= cp; ++u)
                                ADD(userData, u);
                            state = RANGE_END;
                            break;
                        default:;
                    }
                }
                buffer.clear();
                continue; // next character already read
            case '\'': // single UTF-8 character
                if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals)
                    return false;
                if (!readString<READ_CHAR>(userData, buffer, '\''))
                    return false;
                utf8Decode(unicodeBuffer, buffer.c_str());
                if (unicodeBuffer.size() == 1) {
                    switch (state) {
                        case CLEAR:
                            if (unicodeBuffer[0] > 0)
                                ADD(userData, unicodeBuffer[0]);
                            state = TIGHT;
                            break;
                        case RANGE_BRACKET:
                            rangeStart = unicodeBuffer[0];
                            state = RANGE_START;
                            break;
                        case RANGE_SEPARATOR:
                            for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u)
                                ADD(userData, u);
                            state = RANGE_END;
                            break;
                        default:;
                    }
                } else // literal must decode to exactly one value
                    return false;
                unicodeBuffer.clear();
                buffer.clear();
                break;
            case '"': // string of UTF-8 characters
                if (state != CLEAR || disableCharLiterals)
                    return false;
                if (!readString<READ_CHAR>(userData, buffer, '"'))
                    return false;
                utf8Decode(unicodeBuffer, buffer.c_str());
                for (unicode_t cp : unicodeBuffer)
                    ADD(userData, cp);
                unicodeBuffer.clear();
                buffer.clear();
                state = TIGHT;
                break;
            case '[': // character range start
                if (state != CLEAR)
                    return false;
                state = RANGE_BRACKET;
                break;
            case ']': // character range end
                if (state == RANGE_END)
                    state = TIGHT;
                else
                    return false;
                break;
            case '@': // annotation
                if (state != CLEAR)
                    return false;
                c = readWord<READ_CHAR>(userData, buffer);
                if (buffer == "include") {
                    // Includes are not available for every input source (e.g. in-line strings)
                    if (disableInclude)
                        return false;
                    while (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                        c = READ_CHAR(userData);
                    if (c != '"')
                        return false;
                    buffer.clear();
                    if (!readString<READ_CHAR>(userData, buffer, '"'))
                        return false;
                    // Result intentionally not checked: a failed include does not fail the including input
                    INCLUDE(userData, buffer);
                    state = TIGHT;
                } else // unknown annotation
                    return false;
                buffer.clear();
                break;
            case ',': case ';': // separator
                if (!(state == CLEAR || state == TIGHT)) {
                    if (state == RANGE_START)
                        state = RANGE_SEPARATOR;
                    else
                        return false;
                } // else treat as whitespace
                // fallthrough
            case ' ': case '\n': case '\r': case '\t': // whitespace
                if (state == TIGHT)
                    state = CLEAR;
                break;
            case 0xef: // UTF-8 byte order mark
                if (start) {
                    if (!(READ_CHAR(userData) == 0xbb && READ_CHAR(userData) == 0xbf))
                        return false;
                    break;
                }
                // fallthrough - 0xef anywhere else is an unexpected character
            default: // unexpected character
                return false;
        }
        c = READ_CHAR(userData);
    }

    // Input must not end inside an unfinished range
    return state == CLEAR || state == TIGHT;
}
static std::string combinePath(const char *basePath, const char *relPath) {
if (relPath[0] == '/' || (relPath[0] && relPath[1] == ':')) // absolute path?
return relPath;
@ -96,156 +241,57 @@ static std::string combinePath(const char *basePath, const char *relPath) {
return std::string(basePath, lastSlash+1)+relPath;
}
bool Charset::load(const char *filename, bool disableCharLiterals) {
struct CharsetLoadData {
Charset *charset;
const char *filename;
bool disableCharLiterals;
FILE *file;
if (FILE *f = fopen(filename, "rb")) {
enum {
CLEAR,
TIGHT,
RANGE_BRACKET,
RANGE_START,
RANGE_SEPARATOR,
RANGE_END
} state = CLEAR;
std::string buffer;
std::vector<unicode_t> unicodeBuffer;
unicode_t rangeStart = 0;
for (int c = fgetc(f), start = true; c >= 0; start = false) {
switch (c) {
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // number
if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR))
goto FAIL;
buffer.push_back((char) c);
c = readWord(buffer, f);
{
int cp;
if (!parseInt(cp, buffer.c_str()))
goto FAIL;
switch (state) {
case CLEAR:
if (cp >= 0)
add((unicode_t) cp);
state = TIGHT;
break;
case RANGE_BRACKET:
rangeStart = (unicode_t) cp;
state = RANGE_START;
break;
case RANGE_SEPARATOR:
for (unicode_t u = rangeStart; (int) u <= cp; ++u)
add(u);
state = RANGE_END;
break;
default:;
}
}
buffer.clear();
continue; // next character already read
case '\'': // single UTF-8 character
if (!(state == CLEAR || state == RANGE_BRACKET || state == RANGE_SEPARATOR) || disableCharLiterals)
goto FAIL;
if (!readString(buffer, f, '\''))
goto FAIL;
utf8Decode(unicodeBuffer, buffer.c_str());
if (unicodeBuffer.size() == 1) {
switch (state) {
case CLEAR:
if (unicodeBuffer[0] > 0)
add(unicodeBuffer[0]);
state = TIGHT;
break;
case RANGE_BRACKET:
rangeStart = unicodeBuffer[0];
state = RANGE_START;
break;
case RANGE_SEPARATOR:
for (unicode_t u = rangeStart; u <= unicodeBuffer[0]; ++u)
add(u);
state = RANGE_END;
break;
default:;
}
} else
goto FAIL;
unicodeBuffer.clear();
buffer.clear();
break;
case '"': // string of UTF-8 characters
if (state != CLEAR || disableCharLiterals)
goto FAIL;
if (!readString(buffer, f, '"'))
goto FAIL;
utf8Decode(unicodeBuffer, buffer.c_str());
for (unicode_t cp : unicodeBuffer)
add(cp);
unicodeBuffer.clear();
buffer.clear();
state = TIGHT;
break;
case '[': // character range start
if (state != CLEAR)
goto FAIL;
state = RANGE_BRACKET;
break;
case ']': // character range end
if (state == RANGE_END)
state = TIGHT;
else
goto FAIL;
break;
case '@': // annotation
if (state != CLEAR)
goto FAIL;
c = readWord(buffer, f);
if (buffer == "include") {
while (c == ' ' || c == '\t' || c == '\n' || c == '\r')
c = fgetc(f);
if (c != '"')
goto FAIL;
buffer.clear();
if (!readString(buffer, f, '"'))
goto FAIL;
load(combinePath(filename, buffer.c_str()).c_str());
state = TIGHT;
} else
goto FAIL;
buffer.clear();
break;
case ',': case ';': // separator
if (!(state == CLEAR || state == TIGHT)) {
if (state == RANGE_START)
state = RANGE_SEPARATOR;
else
goto FAIL;
} // else treat as whitespace
// fallthrough
case ' ': case '\n': case '\r': case '\t': // whitespace
if (state == TIGHT)
state = CLEAR;
break;
case 0xef: // UTF-8 byte order mark
if (start) {
if (!(fgetc(f) == 0xbb && fgetc(f) == 0xbf))
goto FAIL;
break;
}
default: // unexpected character
goto FAIL;
}
c = fgetc(f);
}
fclose(f);
return state == CLEAR || state == TIGHT;
FAIL:
fclose(f);
return false;
static int readChar(void *userData) {
return fgetc(reinterpret_cast<CharsetLoadData *>(userData)->file);
}
static void add(void *userData, unicode_t cp) {
reinterpret_cast<CharsetLoadData *>(userData)->charset->add(cp);
}
static bool include(void *userData, const std::string &path) {
const CharsetLoadData &ud = *reinterpret_cast<CharsetLoadData *>(userData);
return ud.charset->load(combinePath(ud.filename, path.c_str()).c_str(), ud.disableCharLiterals);
}
};
/// Loads the character set specification stored in the file at filename into this charset.
/// Returns false if the file cannot be opened or if parsing its contents fails.
bool Charset::load(const char *filename, bool disableCharLiterals) {
    FILE *file = fopen(filename, "rb");
    if (!file)
        return false;
    CharsetLoadData loadData = { this, filename, disableCharLiterals, file };
    const bool parsed = charsetParse<CharsetLoadData::readChar, CharsetLoadData::add, CharsetLoadData::include>(&loadData, disableCharLiterals, false);
    // The file is closed regardless of the parse result
    fclose(file);
    return parsed;
}
/// State and callbacks used to parse a charset specification from a memory buffer
struct CharsetParseData {
    /// Charset that receives the parsed values
    Charset *charset;
    /// Next unread byte and one-past-the-end of the input
    const char *cur, *end;

    /// Returns the next input byte as a value in the range 0 to 255 and advances, or -1 once cur has reached end
    static int readChar(void *userData) {
        CharsetParseData *self = reinterpret_cast<CharsetParseData *>(userData);
        if (!(self->cur < self->end))
            return -1;
        const unsigned char byte = (unsigned char) *self->cur;
        ++self->cur;
        return (int) byte;
    }

    /// Forwards cp to the target charset
    static void add(void *userData, unicode_t cp) {
        CharsetParseData *self = reinterpret_cast<CharsetParseData *>(userData);
        self->charset->add(cp);
    }

    /// Includes are never performed for in-memory input - always reports failure
    static bool include(void *, const std::string &) {
        return false;
    }
};
bool Charset::parse(const char *str, size_t strLength, bool disableCharLiterals) {
CharsetParseData userData = { this, str, str+strLength };
return charsetParse<CharsetParseData::readChar, CharsetParseData::add, CharsetParseData::include>(&userData, disableCharLiterals, true);
}
}

View File

@ -69,6 +69,10 @@ R"(
Specifies the input character set. Refer to the documentation for format of charset specification. Defaults to ASCII.
-glyphset <filename>
Specifies the set of input glyphs as glyph indices within the font file.
-chars <charset specification>
Specifies the input character set in-line. Refer to documentation for its syntax.
-glyphs <glyph set specification>
Specifies the set of glyph indices in-line. Refer to documentation for its syntax.
-allglyphs
Specifies that all glyphs within the font file are to be processed.
-fontscale <scale>
@ -290,6 +294,7 @@ struct FontInput {
bool variableFont;
GlyphIdentifierType glyphIdentifierType;
const char *charsetFilename;
const char *charsetString;
double fontScale;
const char *fontName;
};
@ -487,16 +492,31 @@ int main(int argc, const char *const *argv) {
#endif
ARG_CASE("-charset", 1) {
fontInput.charsetFilename = argv[argPos++];
fontInput.charsetString = nullptr;
fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT;
continue;
}
ARG_CASE("-glyphset", 1) {
fontInput.charsetFilename = argv[argPos++];
fontInput.charsetString = nullptr;
fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
continue;
}
ARG_CASE("-chars", 1) {
fontInput.charsetFilename = nullptr;
fontInput.charsetString = argv[argPos++];
fontInput.glyphIdentifierType = GlyphIdentifierType::UNICODE_CODEPOINT;
continue;
}
ARG_CASE("-glyphs", 1) {
fontInput.charsetFilename = nullptr;
fontInput.charsetString = argv[argPos++];
fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
continue;
}
ARG_CASE("-allglyphs", 0) {
fontInput.charsetFilename = nullptr;
fontInput.charsetString = nullptr;
fontInput.glyphIdentifierType = GlyphIdentifierType::GLYPH_INDEX;
continue;
}
@ -512,7 +532,7 @@ int main(int argc, const char *const *argv) {
continue;
}
ARG_CASE("-and", 0) {
if (!fontInput.fontFilename && !fontInput.charsetFilename && fontInput.fontScale < 0)
if (!fontInput.fontFilename && !fontInput.charsetFilename && !fontInput.charsetString && fontInput.fontScale < 0)
ABORT("No font, character set, or font scale specified before -and separator.");
if (!fontInputs.empty() && !memcmp(&fontInputs.back(), &fontInput, sizeof(FontInput)))
ABORT("No changes between subsequent inputs. A different font, character set, or font scale must be set inbetween -and separators.");
@ -926,8 +946,9 @@ int main(int argc, const char *const *argv) {
for (std::vector<FontInput>::reverse_iterator it = fontInputs.rbegin(); it != fontInputs.rend(); ++it) {
if (!it->fontFilename && nextFontInput->fontFilename)
it->fontFilename = nextFontInput->fontFilename;
if (!it->charsetFilename && nextFontInput->charsetFilename) {
if (!(it->charsetFilename || it->charsetString || it->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX) && (nextFontInput->charsetFilename || nextFontInput->charsetString || nextFontInput->glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX)) {
it->charsetFilename = nextFontInput->charsetFilename;
it->charsetString = nextFontInput->charsetString;
it->glyphIdentifierType = nextFontInput->glyphIdentifierType;
}
if (it->fontScale < 0 && nextFontInput->fontScale >= 0)
@ -1100,6 +1121,9 @@ int main(int argc, const char *const *argv) {
if (fontInput.charsetFilename) {
if (!charset.load(fontInput.charsetFilename, fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT))
ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to load glyph set specification." : "Failed to load character set specification.");
} else if (fontInput.charsetString) {
if (!charset.parse(fontInput.charsetString, strlen(fontInput.charsetString), fontInput.glyphIdentifierType != GlyphIdentifierType::UNICODE_CODEPOINT))
ABORT(fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX ? "Failed to parse glyph set specification." : "Failed to parse character set specification.");
} else if (fontInput.glyphIdentifierType == GlyphIdentifierType::GLYPH_INDEX)
msdfgen::getGlyphCount(allGlyphCount, font);
else
@ -1411,7 +1435,7 @@ int main(int argc, const char *const *argv) {
}
} else {
result = 1;
fputs("Shadron preview not supported in -glyphset mode.\n", stderr);
fputs("Shadron preview not supported in glyph set mode.\n", stderr);
}
}