From 3ff246410766f450562a455f61fcb414042a92b0 Mon Sep 17 00:00:00 2001 From: avdoseferovic Date: Fri, 16 Jan 2026 16:24:12 +0100 Subject: [PATCH 1/2] Refactor: Support emojis and high unicode characters Changes: - Change fontDefType.Cw and utf8FontFile.CharWidths from slice to map[int]int to support sparse and high unicode characters (fixing crash). - Update utf8toutf16 to correctly handle 4-byte UTF-8 sequences using surrogate pairs. - Add UnmarshalJSON to fontDefType for backward compatibility with array-based font definitions. - Remove hardcoded limit checks for character widths. --- def.go | 40 ++++++++++++++++++++++++++++++++++++++- font.go | 8 +++++++- fpdf.go | 50 ++++++++++++++++++++++++++++++++----------------- splittext.go | 2 +- utf8fontfile.go | 20 +++++++------------- util.go | 33 ++++++++++---------------------- 6 files changed, 97 insertions(+), 56 deletions(-) diff --git a/def.go b/def.go index 6a7030f9..9a5f16c1 100644 --- a/def.go +++ b/def.go @@ -23,6 +23,7 @@ import ( "encoding/json" "fmt" "io" + "strconv" "time" ) @@ -702,7 +703,7 @@ type fontDefType struct { Desc FontDescType // Font descriptor Up int // Underline position Ut int // Underline thickness - Cw []int // Character width by ordinal + Cw map[int]int // Character width by ordinal Enc string // "cp1252", ... 
Diff string // Differences from reference encoding File string // "Redressed.z" @@ -715,6 +716,43 @@ type fontDefType struct { usedRunes map[int]int // Array of used runes } +// UnmarshalJSON handles both array (legacy) and map (new) formats for Cw +func (f *fontDefType) UnmarshalJSON(data []byte) error { + type Alias fontDefType + aux := &struct { + Cw interface{} + *Alias + }{ + Alias: (*Alias)(f), + } + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + + f.Cw = make(map[int]int) + if aux.Cw != nil { + switch v := aux.Cw.(type) { + case []interface{}: + for i, val := range v { + if fVal, ok := val.(float64); ok { + if fVal != 0 { + f.Cw[i] = int(fVal) + } + } + } + case map[string]interface{}: + for k, val := range v { + if fVal, ok := val.(float64); ok { + if i, err := strconv.Atoi(k); err == nil { + f.Cw[i] = int(fVal) + } + } + } + } + } + return nil +} + // generateFontID generates a font Id from the font definition func generateFontID(fdt fontDefType) (string, error) { // file can be different if generated in different instance diff --git a/font.go b/font.go index 29417bb0..a309c764 100644 --- a/font.go +++ b/font.go @@ -344,7 +344,13 @@ func makeDefinitionFile(fileStr, tpStr, encodingFileStr string, embed bool, encL // dump(def.Desc.FontBBox) def.Up = info.UnderlinePosition def.Ut = info.UnderlineThickness - def.Cw = info.Widths + // def.Cw = info.Widths + def.Cw = make(map[int]int) + for i, w := range info.Widths { + if w != 0 { + def.Cw[i] = w + } + } def.Enc = baseNoExt(encodingFileStr) // fmt.Printf("encodingFileStr [%s], def.Enc [%s]\n", encodingFileStr, def.Enc) // fmt.Printf("reference [%s]\n", filepath.Join(filepath.Dir(encodingFileStr), "cp1252.map")) diff --git a/fpdf.go b/fpdf.go index 3be1cdc2..7a79f241 100644 --- a/fpdf.go +++ b/fpdf.go @@ -963,9 +963,10 @@ func (f *Fpdf) GetStringSymbolWidth(s string) int { unicode := []rune(s) for _, char := range unicode { intChar := int(char) - if len(f.currentFont.Cw) >= intChar && 
f.currentFont.Cw[intChar] > 0 { - if f.currentFont.Cw[intChar] != 65535 { - w += f.currentFont.Cw[intChar] + width, ok := f.currentFont.Cw[intChar] + if ok && width > 0 { + if width != 65535 { + w += width } } else if f.currentFont.Desc.MissingWidth != 0 { w += f.currentFont.Desc.MissingWidth @@ -974,11 +975,22 @@ func (f *Fpdf) GetStringSymbolWidth(s string) int { } } } else { - for _, ch := range []byte(s) { - if ch == 0 { + for _, char := range []byte(s) { + if char == 0 { break } - w += f.currentFont.Cw[ch] + ch := int(char) + if width, ok := f.currentFont.Cw[ch]; ok { + w += width + } else { + // Default behavior for non-existent char in map (should ideally not happen for byte fonts if properly initialized) + // Or assume missing width + if f.currentFont.Desc.MissingWidth != 0 { + w += f.currentFont.Desc.MissingWidth + } else { + w += 500 + } + } } } return w @@ -2545,7 +2557,7 @@ func (f *Fpdf) SplitLines(txt []byte, w float64) [][]byte { l := 0 for i < nb { c := s[i] - l += cw[c] + l += cw[int(c)] if c == ' ' || c == '\t' || c == '\n' { sep = i } @@ -2709,14 +2721,14 @@ func (f *Fpdf) MultiCell(w, h float64, txtStr, borderStr, alignStr string, fill ls = l ns++ } - if int(c) >= len(cw) { - f.err = fmt.Errorf("character outside the supported range: %s", string(c)) - return - } - if cw[int(c)] == 0 { //Marker width 0 used for missing symbols + // if int(c) >= len(cw) { + // f.err = fmt.Errorf("character outside the supported range: %s", string(c)) + // return + // } + if width, ok := cw[int(c)]; !ok || width == 0 { //Marker width 0 used for missing symbols l += f.currentFont.Desc.MissingWidth - } else if cw[int(c)] != 65535 { //Marker width 65535 used for zero width symbols - l += cw[int(c)] + } else if width != 65535 { //Marker width 65535 used for zero width symbols + l += width } if l > wmax { // Automatic line break @@ -4079,7 +4091,11 @@ func (f *Fpdf) putfonts() { var s fmtBuffer s.WriteString("[") for j := 32; j < 256; j++ { - s.printf("%d ", font.Cw[j]) 
+ if width, ok := font.Cw[j]; ok { + s.printf("%d ", width) + } else { + s.WriteString("0 ") + } } s.WriteString("]") f.out(s.String()) @@ -4200,10 +4216,10 @@ func (f *Fpdf) generateCIDFontMap(font *fontDefType, LastRune int) { // for each character for cid := startCid; cid < cwLen; cid++ { - if font.Cw[cid] == 0x00 { + width, ok := font.Cw[cid] + if !ok || width == 0x00 { continue } - width := font.Cw[cid] if width == 65535 { width = 0 } diff --git a/splittext.go b/splittext.go index 525f93b0..b203e298 100644 --- a/splittext.go +++ b/splittext.go @@ -25,7 +25,7 @@ func (f *Fpdf) SplitText(txt string, w float64) (lines []string) { l := 0 for i < nb { c := s[i] - l += cw[c] + l += cw[int(c)] if unicode.IsSpace(c) || isChinese(c) { sep = i } diff --git a/utf8fontfile.go b/utf8fontfile.go index 0e1a17a7..f0f96df2 100644 --- a/utf8fontfile.go +++ b/utf8fontfile.go @@ -51,7 +51,7 @@ type utf8FontFile struct { Flags int UnderlinePosition float64 UnderlineThickness float64 - CharWidths []int + CharWidths map[int]int DefaultWidth float64 symbolData map[int]map[string][]int CodeSymbolDictionary map[int]int @@ -836,7 +836,7 @@ func (utf *utf8FontFile) parseHMTXTable(numberOfHMetrics, numSymbols int, symbol start := utf.SeekTable("hmtx") arrayWidths := 0 var arr []int - utf.CharWidths = make([]int, 256*256) + utf.CharWidths = make(map[int]int) charCount := 0 arr = unpackUint16Array(utf.getRange(start, numberOfHMetrics*4)) for symbol := 0; symbol < numberOfHMetrics; symbol++ { @@ -856,10 +856,8 @@ func (utf *utf8FontFile) parseHMTXTable(numberOfHMetrics, numSymbols int, symbol if widths == 0 { widths = 65535 } - if char < 196608 { - utf.CharWidths[char] = widths - charCount++ - } + utf.CharWidths[char] = widths + charCount++ } } } @@ -874,10 +872,8 @@ func (utf *utf8FontFile) parseHMTXTable(numberOfHMetrics, numSymbols int, symbol if widths == 0 { widths = 65535 } - if char < 196608 { - utf.CharWidths[char] = widths - charCount++ - } + utf.CharWidths[char] = widths + 
charCount++ } } } @@ -967,9 +963,7 @@ func (utf *utf8FontFile) generateSCCSDictionaries(runeCmapPosition int, symbolCh } } charSymbolDictionary[char] = symbol - if char < 196608 { - maxRune = max(char, maxRune) - } + maxRune = max(char, maxRune) symbolCharDictionary[symbol] = append(symbolCharDictionary[symbol], char) } } diff --git a/util.go b/util.go index 351d3192..ea5e1c4c 100644 --- a/util.go +++ b/util.go @@ -115,29 +115,16 @@ func utf8toutf16(s string, withBOM ...bool) string { if bom { res = append(res, 0xFE, 0xFF) } - nb := len(s) - i := 0 - for i < nb { - c1 := byte(s[i]) - i++ - switch { - case c1 >= 224: - // 3-byte character - c2 := byte(s[i]) - i++ - c3 := byte(s[i]) - i++ - res = append(res, ((c1&0x0F)<<4)+((c2&0x3C)>>2), - ((c2&0x03)<<6)+(c3&0x3F)) - case c1 >= 192: - // 2-byte character - c2 := byte(s[i]) - i++ - res = append(res, ((c1 & 0x1C) >> 2), - ((c1&0x03)<<6)+(c2&0x3F)) - default: - // Single-byte character - res = append(res, 0, c1) + for _, r := range s { + if r < 0x10000 { + // BMP character + res = append(res, byte(r>>8), byte(r)) + } else { + // Supplementary character (needs surrogate pair) + r -= 0x10000 + high := 0xD800 | (r >> 10) + low := 0xDC00 | (r & 0x3FF) + res = append(res, byte(high>>8), byte(high), byte(low>>8), byte(low)) } } return string(res) From 5dfb7936551e0b744a1676149f38e7921f583976 Mon Sep 17 00:00:00 2001 From: avdoseferovic Date: Fri, 16 Jan 2026 16:38:48 +0100 Subject: [PATCH 2/2] Refactor: Phase 2 and 3 implementation for Emoji support Changes: - Implement CMAP Format 12 parsing in utf8fontfile.go. - Implement CID remapping in fpdf.go to support characters outside BMP (e.g. Emojis). - Add runeToCid map to fontDefType. - Add helper methods stringToCIDs and getOrAssignCID. - Update Text, CellFormat, and generateCIDFontMap to use CID remapping and correct width lookup. - Update parseSymbols to use CIDs as keys for GID lookup. 
--- def.go | 2 + fpdf.go | 109 ++++++++++++++++++++++++++++++++++++++++-------- utf8fontfile.go | 50 +++++++++++++++++----- 3 files changed, 133 insertions(+), 28 deletions(-) diff --git a/def.go b/def.go index 9a5f16c1..bb2cb16d 100644 --- a/def.go +++ b/def.go @@ -714,6 +714,8 @@ type fontDefType struct { i string // 1-based position in font list, set by font loader, not this program utf8File *utf8FontFile // UTF-8 font usedRunes map[int]int // Array of used runes + runeToCid map[int]int // Map of rune to CID (for remapping) + nextFreeCID int // Next available CID for remapping } // UnmarshalJSON handles both array (legacy) and map (new) formats for Cw diff --git a/fpdf.go b/fpdf.go index 7a79f241..d4b5b3ea 100644 --- a/fpdf.go +++ b/fpdf.go @@ -1745,6 +1745,10 @@ func (f *Fpdf) addFont(familyStr, styleStr, fileStr string, isUTF8 bool) { usedRunes: sbarr, File: fileStr, utf8File: utf8File, + runeToCid: make(map[int]int), + } + for cid, r := range sbarr { + def.runeToCid[r] = cid } def.i, _ = generateFontID(def) f.fonts[fontKey] = def @@ -1881,6 +1885,10 @@ func (f *Fpdf) addFontFromBytes(familyStr, styleStr string, jsonFileBytes, zFile Cw: utf8File.CharWidths, utf8File: utf8File, usedRunes: sbarr, + runeToCid: make(map[int]int), + } + for cid, r := range sbarr { + def.runeToCid[r] = cid } def.i, _ = generateFontID(def) f.fonts[fontkey] = def @@ -2213,6 +2221,61 @@ func (f *Fpdf) Bookmark(txtStr string, level int, y float64) { f.outlines = append(f.outlines, outlineType{text: txtStr, level: level, y: y, p: f.PageNo(), prev: -1, last: -1, next: -1, first: -1}) } +func (f *Fpdf) getOrAssignCID(r int) int { + if cid, ok := f.currentFont.runeToCid[r]; ok { + return cid + } + + cid := r + // If the rune is in BMP and not already used as a CID for another rune (identity mapping), use it. + // But we must check if 'cid' is already occupied by a different rune? + // If runeToCid is empty initially, and usedRunes is empty. + // We want to prefer Identity. 
+ // Check if this CID slot is free in usedRunes. + // Note: usedRunes[cid] = original_rune + if r < 0xFFFF { + if original, used := f.currentFont.usedRunes[r]; !used || original == r { + cid = r + } else { + cid = f.findNextFreeCID() + } + } else { + cid = f.findNextFreeCID() + } + + f.currentFont.runeToCid[r] = cid + f.currentFont.usedRunes[cid] = r + return cid +} + +func (f *Fpdf) findNextFreeCID() int { + // Start searching from PUA + start := 0xE000 + for i := start; i < 0xFFFF; i++ { + if _, used := f.currentFont.usedRunes[i]; !used { + return i + } + } + // If PUA full, search from beginning? + for i := 32; i < 0xE000; i++ { + if _, used := f.currentFont.usedRunes[i]; !used { + return i + } + } + // Fallback to 0 if full (should panic?) + return 0 +} + +func (f *Fpdf) stringToCIDs(s string) string { + var b bytes.Buffer + for _, r := range s { + cid := f.getOrAssignCID(int(r)) + b.WriteByte(byte(cid >> 8)) + b.WriteByte(byte(cid)) + } + return b.String() +} + // Text prints a character string. The origin (x, y) is on the left of the // first character at the baseline. 
This method permits a string to be placed // precisely on the page, but it is usually easier to use Cell(), MultiCell() @@ -2224,10 +2287,7 @@ func (f *Fpdf) Text(x, y float64, txtStr string) { txtStr = reverseText(txtStr) x -= f.GetStringWidth(txtStr) } - txt2 = f.escape(utf8toutf16(txtStr, false)) - for _, uni := range []rune(txtStr) { - f.currentFont.usedRunes[int(uni)] = int(uni) - } + txt2 = f.escape(f.stringToCIDs(txtStr)) } else { txt2 = f.escape(txtStr) } @@ -2436,10 +2496,10 @@ func (f *Fpdf) CellFormat(w, h float64, txtStr, borderStr string, ln int, txtStr = reverseText(txtStr) } wmax := int(math.Ceil((w - 2*f.cMargin) * 1000 / f.fontSize)) - for _, uni := range []rune(txtStr) { - f.currentFont.usedRunes[int(uni)] = int(uni) - } - space := f.escape(utf8toutf16(" ", false)) + // for _, uni := range []rune(txtStr) { + // f.currentFont.usedRunes[int(uni)] = int(uni) + // } + space := f.escape(f.stringToCIDs(" ")) strSize := f.GetStringSymbolWidth(txtStr) s.printf("BT 0 Tw %.2f %.2f Td [", (f.x+dx)*k, (f.h-(f.y+.5*h+.3*f.fontSize))*k) t := strings.Split(txtStr, " ") @@ -2447,7 +2507,7 @@ func (f *Fpdf) CellFormat(w, h float64, txtStr, borderStr string, ln int, numt := len(t) for i := 0; i < numt; i++ { tx := t[i] - tx = "(" + f.escape(utf8toutf16(tx, false)) + ")" + tx = "(" + f.escape(f.stringToCIDs(tx)) + ")" s.printf("%s ", tx) if (i + 1) < numt { s.printf("%.3f(%s) ", -shift, space) @@ -2460,10 +2520,10 @@ func (f *Fpdf) CellFormat(w, h float64, txtStr, borderStr string, ln int, if f.isRTL { txtStr = reverseText(txtStr) } - txt2 = f.escape(utf8toutf16(txtStr, false)) - for _, uni := range []rune(txtStr) { - f.currentFont.usedRunes[int(uni)] = int(uni) - } + txt2 = f.escape(f.stringToCIDs(txtStr)) + // for _, uni := range []rune(txtStr) { + // f.currentFont.usedRunes[int(uni)] = int(uni) + // } } else { txt2 = strings.Replace(txtStr, "\\", "\\\\", -1) @@ -2848,7 +2908,12 @@ func (f *Fpdf) write(h float64, txtStr string, link int, linkStr string) { if c == 
' ' { sep = i } - l += float64(cw[int(c)]) + // l += float64(cw[int(c)]) + if width, ok := cw[int(c)]; ok { + l += float64(width) + } else { + l += float64(f.currentFont.Desc.MissingWidth) + } if l > wmax { // Automatic line break if sep == -1 { @@ -4216,16 +4281,24 @@ func (f *Fpdf) generateCIDFontMap(font *fontDefType, LastRune int) { // for each character for cid := startCid; cid < cwLen; cid++ { - width, ok := font.Cw[cid] + runa, used := font.usedRunes[cid] + if cid > 255 && (!used || runa == 0) { + continue + } + if !used { + runa = cid + } + + width, ok := font.Cw[runa] if !ok || width == 0x00 { continue } if width == 65535 { width = 0 } - if numb, OK := font.usedRunes[cid]; cid > 255 && (!OK || numb == 0) { - continue - } + // if numb, OK := font.usedRunes[cid]; cid > 255 && (!OK || numb == 0) { + // continue + // } if cid == prevCid+1 { if width == prevWidth { diff --git a/utf8fontfile.go b/utf8fontfile.go index f0f96df2..c41ab897 100644 --- a/utf8fontfile.go +++ b/utf8fontfile.go @@ -456,14 +456,18 @@ func (utf *utf8FontFile) parseCMAPTable(format int) int { coded := utf.readUint16() position := utf.readUint32() oldReaderPosition := utf.fileReader.readerPosition - if (system == 3 && coded == 1) || system == 0 { // Microsoft, Unicode + // System 3: Windows + // Coded 1: Unicode BMP (UCS-2) + // Coded 10: Unicode Full (UCS-4) + if (system == 3 && (coded == 1 || coded == 10)) || system == 0 { format = utf.getUint16(cmapPosition + position) - if format == 4 { - if cidCMAPPosition == 0 { - cidCMAPPosition = cmapPosition + position - } + if format == 12 { + cidCMAPPosition = cmapPosition + position break } + if format == 4 { + cidCMAPPosition = cmapPosition + position + } } utf.seek(int(oldReaderPosition)) } @@ -504,12 +508,15 @@ func (utf *utf8FontFile) generateCMAP() map[int][]int { coder := utf.readUint16() position := utf.readUint32() oldPosition := utf.fileReader.readerPosition - if (system == 3 && coder == 1) || system == 0 { + if (system == 3 && (coder 
== 1 || coder == 10)) || system == 0 { format := utf.getUint16(cmapPosition + position) - if format == 4 { + if format == 12 { runeCmapPosition = cmapPosition + position break } + if format == 4 { + runeCmapPosition = cmapPosition + position + } } utf.seek(int(oldPosition)) } @@ -531,13 +538,13 @@ func (utf *utf8FontFile) generateCMAP() map[int][]int { func (utf *utf8FontFile) parseSymbols(usedRunes map[int]int) (map[int]int, map[int]int, map[int]int, []int) { symbolCollection := map[int]int{0: 0} charSymbolPairCollection := make(map[int]int) - for _, char := range usedRunes { + for cid, char := range usedRunes { if _, OK := utf.charSymbolDictionary[char]; OK { symbolCollection[utf.charSymbolDictionary[char]] = char - charSymbolPairCollection[char] = utf.charSymbolDictionary[char] + charSymbolPairCollection[cid] = utf.charSymbolDictionary[char] } - utf.LastRune = max(utf.LastRune, char) + utf.LastRune = max(utf.LastRune, cid) } begin := utf.tableDescriptions["glyf"].position @@ -918,6 +925,29 @@ func (utf *utf8FontFile) parseLOCATable(format, numSymbols int) { } func (utf *utf8FontFile) generateSCCSDictionaries(runeCmapPosition int, symbolCharDictionary map[int][]int, charSymbolDictionary map[int]int) { + utf.seek(runeCmapPosition) + format := utf.readUint16() + + if format == 12 { + utf.skip(2) // reserved + _ = utf.readUint32() // length + utf.skip(4) // language + nGroups := utf.readUint32() + + for i := 0; i < int(nGroups); i++ { + startCharCode := int(utf.readUint32()) + endCharCode := int(utf.readUint32()) + startGlyphID := int(utf.readUint32()) + + for char := startCharCode; char <= endCharCode; char++ { + symbol := startGlyphID + (char - startCharCode) + charSymbolDictionary[char] = symbol + symbolCharDictionary[symbol] = append(symbolCharDictionary[symbol], char) + } + } + return + } + maxRune := 0 utf.seek(runeCmapPosition + 2) size := utf.readUint16()