parser/pageparser: Preserve non-ASCII whitespace after e.g. summary divider

bep · bep · commit 322e59904dd8 · 2026-06-17T11:56:44.000+02:00
Make the lexer instead consume only ASCII whitespace. This preserves e.g. ideographic space (U+3000) after the summary divider, which is important for e.g. Chinese and Japanese content, and possibly other Unicode whitespace characters that carry meaning.

This is potentially a breaking change, though unlikely to be one in practice, and it is obviously the correct thing to do.
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
@@ -248,7 +248,7 @@ func (l *pageLexer) consumeCRLF() bool {
 func (l *pageLexer) consumeToSpace() {
 	for {
 		r := l.next()
-		if r == eof || unicode.IsSpace(r) {
+		if r == eof || isASCIISpace(r) {
 			l.backup()
 			return
 		}
@@ -258,7 +258,7 @@ func (l *pageLexer) consumeToSpace() {
 func (l *pageLexer) consumeSpace() {
 	for {
 		r := l.next()
-		if r == eof || !unicode.IsSpace(r) {
+		if r == eof || !isASCIISpace(r) {
 			l.backup()
 			return
 		}
@@ -497,7 +497,7 @@ func minIndex(indices ...int) int {
 
 func indexNonWhiteSpace(s []byte, in rune) int {
 	idx := bytes.IndexFunc(s, func(r rune) bool {
-		return !unicode.IsSpace(r)
+		return !isASCIISpace(r)
 	})
 
 	if idx == -1 {
@@ -511,8 +511,12 @@ func indexNonWhiteSpace(s []byte, in rune) int {
 	return -1
 }
 
-func isSpace(r rune) bool {
-	return r == ' ' || r == '\t'
+func isASCIISpace(r rune) bool {
+	switch r {
+	case '\t', '\n', '\v', '\f', '\r', ' ':
+		return true
+	}
+	return false
 }
 
 func isAlphaNumericOrHyphen(r rune) bool {
diff --git a/parser/pageparser/pagelexer_intro.go b/parser/pageparser/pagelexer_intro.go
@@ -32,7 +32,7 @@ LOOP:
 			return lexFrontMatterOrgMode
 		case r == byteOrderMark:
 			l.emit(TypeIgnore)
-		case !isSpace(r) && !isEndOfLine(r):
+		case !isASCIISpace(r) && !isEndOfLine(r):
 			break LOOP
 		}
 	}
diff --git a/parser/pageparser/pagelexer_shortcode.go b/parser/pageparser/pagelexer_shortcode.go
@@ -290,7 +290,7 @@ func lexEndOfShortcode(l *pageLexer) stateFunc {
 		return lexShortcodeRightDelim
 	}
 	switch r := l.next(); {
-	case isSpace(r):
+	case isASCIISpace(r):
 		l.ignore()
 	default:
 		return l.errorf("unclosed shortcode")
@@ -307,7 +307,7 @@ func lexInsideShortcode(l *pageLexer) stateFunc {
 	case r == eof:
 		// eol is allowed inside shortcodes; this may go to end of document before it fails
 		return l.errorf("unclosed shortcode action")
-	case isSpace(r), isEndOfLine(r):
+	case isASCIISpace(r):
 		l.ignore()
 	case r == '=':
 		l.consumeSpace()
diff --git a/parser/pageparser/pageparser_test.go b/parser/pageparser/pageparser_test.go
@@ -138,3 +138,13 @@ func TestSummaryDividerStartingFromMain(t *testing.T) {
 	c.Assert(items, qt.HasLen, 4)
 	c.Assert(items[1].Type, qt.Equals, TypeLeadSummaryDivider)
 }
+
+func TestIdeographicAfterSummaryDivider(t *testing.T) {
+	c := qt.New(t)
+
+	input := []byte(`aaa <!--more-->   　bbb`)
+	items, err := collectStringMain(string(input))
+	c.Assert(err, qt.IsNil)
+	c.Assert(items, qt.HasLen, 4)
+	c.Assert(items[2].ValStr(input), qt.Equals, "\u3000bbb")
+}

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ LOOP:`
`32`	`32`	`return lexFrontMatterOrgMode`
`33`	`33`	`case r == byteOrderMark:`
`34`	`34`	`l.emit(TypeIgnore)`
`35`		`- case !isSpace(r) && !isEndOfLine(r):`
	`35`	`+ case !isASCIISpace(r) && !isEndOfLine(r):`
`36`	`36`	`break LOOP`
`37`	`37`	`}`
`38`	`38`	`}`