Skip to content

Commit 322e599

Browse files
committed
parser/pageparser: Preserve non-ASCII whitespace after e.g. summary divider
Make it instead consume just ASCII whitespace, which preserves e.g. ideographic space (U+3000) after the summary divider. This is important for e.g. Chinese and Japanese content, and possibly for other Unicode whitespace characters with meaning. Doing this is possibly breaking, but not likely, and obviously the correct thing to do.
1 parent 96e06e1 commit 322e599

4 files changed

Lines changed: 22 additions & 8 deletions

File tree

parser/pageparser/pagelexer.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ func (l *pageLexer) consumeCRLF() bool {
248248
func (l *pageLexer) consumeToSpace() {
249249
for {
250250
r := l.next()
251-
if r == eof || unicode.IsSpace(r) {
251+
if r == eof || isASCIISpace(r) {
252252
l.backup()
253253
return
254254
}
@@ -258,7 +258,7 @@ func (l *pageLexer) consumeToSpace() {
258258
func (l *pageLexer) consumeSpace() {
259259
for {
260260
r := l.next()
261-
if r == eof || !unicode.IsSpace(r) {
261+
if r == eof || !isASCIISpace(r) {
262262
l.backup()
263263
return
264264
}
@@ -497,7 +497,7 @@ func minIndex(indices ...int) int {
497497

498498
func indexNonWhiteSpace(s []byte, in rune) int {
499499
idx := bytes.IndexFunc(s, func(r rune) bool {
500-
return !unicode.IsSpace(r)
500+
return !isASCIISpace(r)
501501
})
502502

503503
if idx == -1 {
@@ -511,8 +511,12 @@ func indexNonWhiteSpace(s []byte, in rune) int {
511511
return -1
512512
}
513513

514-
func isSpace(r rune) bool {
515-
return r == ' ' || r == '\t'
514+
func isASCIISpace(r rune) bool {
515+
switch r {
516+
case '\t', '\n', '\v', '\f', '\r', ' ':
517+
return true
518+
}
519+
return false
516520
}
517521

518522
func isAlphaNumericOrHyphen(r rune) bool {

parser/pageparser/pagelexer_intro.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ LOOP:
3232
return lexFrontMatterOrgMode
3333
case r == byteOrderMark:
3434
l.emit(TypeIgnore)
35-
case !isSpace(r) && !isEndOfLine(r):
35+
case !isASCIISpace(r) && !isEndOfLine(r):
3636
break LOOP
3737
}
3838
}

parser/pageparser/pagelexer_shortcode.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ func lexEndOfShortcode(l *pageLexer) stateFunc {
290290
return lexShortcodeRightDelim
291291
}
292292
switch r := l.next(); {
293-
case isSpace(r):
293+
case isASCIISpace(r):
294294
l.ignore()
295295
default:
296296
return l.errorf("unclosed shortcode")
@@ -307,7 +307,7 @@ func lexInsideShortcode(l *pageLexer) stateFunc {
307307
case r == eof:
308308
// eol is allowed inside shortcodes; this may go to end of document before it fails
309309
return l.errorf("unclosed shortcode action")
310-
case isSpace(r), isEndOfLine(r):
310+
case isASCIISpace(r):
311311
l.ignore()
312312
case r == '=':
313313
l.consumeSpace()

parser/pageparser/pageparser_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,13 @@ func TestSummaryDividerStartingFromMain(t *testing.T) {
138138
c.Assert(items, qt.HasLen, 4)
139139
c.Assert(items[1].Type, qt.Equals, TypeLeadSummaryDivider)
140140
}
141+
142+
func TestIdeographicAfterSummaryDivider(t *testing.T) {
143+
c := qt.New(t)
144+
145+
input := []byte(`aaa <!--more-->　bbb`)
146+
items, err := collectStringMain(string(input))
147+
c.Assert(err, qt.IsNil)
148+
c.Assert(items, qt.HasLen, 4)
149+
c.Assert(items[2].ValStr(input), qt.Equals, "\u3000bbb")
150+
}

0 commit comments

Comments
 (0)