Get rid of the preprocess stage

Yay!!
This commit is contained in:
Vytautas Šaltenis 2016-11-10 21:49:58 +02:00
parent 22a3e5b744
commit 120bb2fae1
8 changed files with 134 additions and 159 deletions

190
block.go
View File

@ -35,10 +35,6 @@ var (
// Note: this function and many that it calls assume that
// the input buffer ends with a newline.
func (p *parser) block(data []byte) {
if len(data) == 0 || data[len(data)-1] != '\n' {
panic("block input is missing terminating newline")
}
// this is called recursively: enforce a maximum depth
if p.nesting >= p.maxNesting {
return
@ -130,7 +126,7 @@ func (p *parser) block(data []byte) {
if p.isHRule(data) {
p.addBlock(HorizontalRule, nil)
var i int
for i = 0; data[i] != '\n'; i++ {
for i = 0; i < len(data) && data[i] != '\n'; i++ {
}
data = data[i:]
continue
@ -215,10 +211,10 @@ func (p *parser) isPrefixHeader(data []byte) bool {
if p.flags&SpaceHeaders != 0 {
level := 0
for level < 6 && data[level] == '#' {
for level < 6 && level < len(data) && data[level] == '#' {
level++
}
if data[level] != ' ' {
if level == len(data) || data[level] != ' ' {
return false
}
}
@ -227,7 +223,7 @@ func (p *parser) isPrefixHeader(data []byte) bool {
func (p *parser) prefixHeader(data []byte) int {
level := 0
for level < 6 && data[level] == '#' {
for level < 6 && level < len(data) && data[level] == '#' {
level++
}
i := skipChar(data, level, ' ')
@ -276,7 +272,7 @@ func (p *parser) isUnderlinedHeader(data []byte) int {
if data[0] == '=' {
i := skipChar(data, 1, '=')
i = skipChar(data, i, ' ')
if data[i] == '\n' {
if i < len(data) && data[i] == '\n' {
return 1
}
return 0
@ -286,7 +282,7 @@ func (p *parser) isUnderlinedHeader(data []byte) int {
if data[0] == '-' {
i := skipChar(data, 1, '-')
i = skipChar(data, i, ' ')
if data[i] == '\n' {
if i < len(data) && data[i] == '\n' {
return 2
}
return 0
@ -444,6 +440,9 @@ func (p *parser) htmlComment(data []byte, doRender bool) int {
// HR, which is the only self-closing block tag considered
func (p *parser) htmlHr(data []byte, doRender bool) int {
if len(data) < 4 {
return 0
}
if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
return 0
}
@ -451,13 +450,11 @@ func (p *parser) htmlHr(data []byte, doRender bool) int {
// not an <hr> tag after all; at least not a valid one
return 0
}
i := 3
for data[i] != '>' && data[i] != '\n' {
for i < len(data) && data[i] != '>' && data[i] != '\n' {
i++
}
if data[i] == '>' {
if i < len(data) && data[i] == '>' {
i++
if j := p.isEmpty(data[i:]); j > 0 {
size := i + j
@ -472,13 +469,12 @@ func (p *parser) htmlHr(data []byte, doRender bool) int {
return size
}
}
return 0
}
func (p *parser) htmlFindTag(data []byte) (string, bool) {
i := 0
for isalnum(data[i]) {
for i < len(data) && isalnum(data[i]) {
i++
}
key := string(data[:i])
@ -535,7 +531,10 @@ func (*parser) isEmpty(data []byte) int {
return 0
}
}
return i + 1
if i < len(data) && data[i] == '\n' {
i++
}
return i
}
func (*parser) isHRule(data []byte) bool {
@ -554,7 +553,7 @@ func (*parser) isHRule(data []byte) bool {
// the whole line must be the char or whitespace
n := 0
for data[i] != '\n' {
for i < len(data) && data[i] != '\n' {
switch {
case data[i] == c:
n++
@ -570,8 +569,7 @@ func (*parser) isHRule(data []byte) bool {
// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
// and returns the end index if so, or 0 otherwise. It also returns the marker found.
// If syntax is not nil, it gets set to the syntax specified in the fence line.
// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
func isFenceLine(data []byte, syntax *string, oldmarker string) (end int, marker string) {
i, size := 0, 0
// skip up to three spaces
@ -613,7 +611,7 @@ func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional
i = skipChar(data, i, ' ')
if i >= len(data) {
if newlineOptional && i == len(data) {
if i == len(data) {
return i, marker
}
return 0, ""
@ -658,12 +656,11 @@ func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional
i = skipChar(data, i, ' ')
if i >= len(data) || data[i] != '\n' {
if newlineOptional && i == len(data) {
if i == len(data) {
return i, marker
}
return 0, ""
}
return i + 1, marker // Take newline into account.
}
@ -672,7 +669,7 @@ func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional
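// NOTE(review): this used to say a final newline is mandatory when doRender is true; with newlineOptional removed, isFenceLine accepts a fence line ending at end of input regardless of doRender — confirm and update.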
func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
var syntax string
beg, marker := isFenceLine(data, &syntax, "", false)
beg, marker := isFenceLine(data, &syntax, "")
if beg == 0 || beg >= len(data) {
return 0
}
@ -685,8 +682,7 @@ func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
// safe to assume beg < len(data)
// check for the end of the code block
newlineOptional := !doRender
fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
fenceEnd, _ := isFenceLine(data[beg:], nil, marker)
if fenceEnd != 0 {
beg += fenceEnd
break
@ -756,7 +752,7 @@ func (p *parser) table(data []byte) int {
for i < len(data) {
pipes, rowStart := 0, i
for ; data[i] != '\n'; i++ {
for ; i < len(data) && data[i] != '\n'; i++ {
if data[i] == '|' {
pipes++
}
@ -768,7 +764,9 @@ func (p *parser) table(data []byte) int {
}
// include the newline in data sent to tableRow
i++
if i < len(data)-1 && data[i] == '\n' {
i++
}
p.tableRow(data[rowStart:i], columns, false)
}
@ -787,7 +785,7 @@ func isBackslashEscaped(data []byte, i int) bool {
func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
i := 0
colCount := 1
for i = 0; data[i] != '\n'; i++ {
for i = 0; i < len(data) && data[i] != '\n'; i++ {
if data[i] == '|' && !isBackslashEscaped(data, i) {
colCount++
}
@ -799,7 +797,11 @@ func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
}
// include the newline in the data sent to tableRow
header := data[:i+1]
j := i
if j < len(data) && data[j] == '\n' {
j++
}
header := data[:j]
// column count ignores pipes at beginning or end of line
if data[0] == '|' {
@ -825,7 +827,7 @@ func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
// and trailing | optional on last column
col := 0
for data[i] != '\n' {
for i < len(data) && data[i] != '\n' {
dashes := 0
if data[i] == ':' {
@ -833,19 +835,21 @@ func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
columns[col] |= TableAlignmentLeft
dashes++
}
for data[i] == '-' {
for i < len(data) && data[i] == '-' {
i++
dashes++
}
if data[i] == ':' {
if i < len(data) && data[i] == ':' {
i++
columns[col] |= TableAlignmentRight
dashes++
}
for data[i] == ' ' {
for i < len(data) && data[i] == ' ' {
i++
}
if i == len(data) {
return
}
// end of column test is messy
switch {
case dashes < 3:
@ -856,12 +860,12 @@ func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
// marker found, now skip past trailing whitespace
col++
i++
for data[i] == ' ' {
for i < len(data) && data[i] == ' ' {
i++
}
// trailing junk found after last column
if col >= colCount && data[i] != '\n' {
if col >= colCount && i < len(data) && data[i] != '\n' {
return
}
@ -884,7 +888,10 @@ func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
p.addBlock(TableHead, nil)
p.tableRow(header, columns, true)
size = i + 1
size = i
if size < len(data) && data[size] == '\n' {
size++
}
return
}
@ -897,13 +904,13 @@ func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
}
for col = 0; col < len(columns) && i < len(data); col++ {
for data[i] == ' ' {
for i < len(data) && data[i] == ' ' {
i++
}
cellStart := i
for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
i++
}
@ -912,7 +919,7 @@ func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
// skip the end-of-cell marker, possibly taking us past end of buffer
i++
for cellEnd > cellStart && data[cellEnd-1] == ' ' {
for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' {
cellEnd--
}
@ -934,11 +941,11 @@ func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
// returns blockquote prefix length
func (p *parser) quotePrefix(data []byte) int {
i := 0
for i < 3 && data[i] == ' ' {
for i < 3 && i < len(data) && data[i] == ' ' {
i++
}
if data[i] == '>' {
if data[i+1] == ' ' {
if i < len(data) && data[i] == '>' {
if i < len(data)-1 && data[i+1] == ' ' {
return i + 2
}
return i + 1
@ -968,7 +975,7 @@ func (p *parser) quote(data []byte) int {
// Step over whole lines, collecting them. While doing that, check for
// fenced code and if one's found, incorporate it altogether,
// regardless of any contents inside it
for data[end] != '\n' {
for end < len(data) && data[end] != '\n' {
if p.flags&FencedCode != 0 {
if i := p.fencedCodeBlock(data[end:], false); i > 0 {
// -1 to compensate for the extra end++ after the loop:
@ -978,7 +985,9 @@ func (p *parser) quote(data []byte) int {
}
end++
}
end++
if end < len(data) && data[end] == '\n' {
end++
}
if pre := p.quotePrefix(data[beg:]); pre > 0 {
// skip the prefix
beg += pre
@ -996,7 +1005,10 @@ func (p *parser) quote(data []byte) int {
// returns prefix length for block code
func (p *parser) codePrefix(data []byte) int {
if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
if data[0] == '\t' {
return 1
}
if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
return 4
}
return 0
@ -1008,10 +1020,12 @@ func (p *parser) code(data []byte) int {
i := 0
for i < len(data) {
beg := i
for data[i] != '\n' {
for i < len(data) && data[i] != '\n' {
i++
}
if i < len(data) && data[i] == '\n' {
i++
}
i++
blankline := p.isEmpty(data[beg:i]) > 0
if pre := p.codePrefix(data[beg:i]); pre > 0 {
@ -1022,7 +1036,7 @@ func (p *parser) code(data []byte) int {
break
}
// verbatim copy to the working buffeu
// verbatim copy to the working buffer
if blankline {
work.WriteByte('\n')
} else {
@ -1052,15 +1066,16 @@ func (p *parser) code(data []byte) int {
// returns unordered list item prefix
func (p *parser) uliPrefix(data []byte) int {
i := 0
// start with up to 3 spaces
for i < 3 && data[i] == ' ' {
for i < len(data) && i < 3 && data[i] == ' ' {
i++
}
if i >= len(data)-1 {
return 0
}
// need a *, +, or - followed by a space
if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
data[i+1] != ' ' {
(data[i+1] != ' ' && data[i+1] != '\t') {
return 0
}
return i + 2
@ -1071,18 +1086,21 @@ func (p *parser) oliPrefix(data []byte) int {
i := 0
// start with up to 3 spaces
for i < 3 && data[i] == ' ' {
for i < 3 && i < len(data) && data[i] == ' ' {
i++
}
// count the digits
start := i
for data[i] >= '0' && data[i] <= '9' {
for i < len(data) && data[i] >= '0' && data[i] <= '9' {
i++
}
if start == i || i >= len(data)-1 {
return 0
}
// we need >= 1 digits followed by a dot and a space
if start == i || data[i] != '.' || data[i+1] != ' ' {
if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') {
return 0
}
return i + 2
@ -1090,13 +1108,15 @@ func (p *parser) oliPrefix(data []byte) int {
// returns definition list item prefix
func (p *parser) dliPrefix(data []byte) int {
i := 0
// need a : followed by a spaces
if data[i] != ':' || data[i+1] != ' ' {
if len(data) < 2 {
return 0
}
for data[i] == ' ' {
i := 0
// need a ':' followed by a space or a tab
if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') {
return 0
}
for i < len(data) && data[i] == ' ' {
i++
}
return i + 2
@ -1174,6 +1194,9 @@ func finalizeList(block *Node) {
func (p *parser) listItem(data []byte, flags *ListType) int {
// keep track of the indentation of the first line
itemIndent := 0
if data[itemIndent] == '\t' {
itemIndent += 4
}
for itemIndent < 3 && data[itemIndent] == ' ' {
itemIndent++
}
@ -1202,13 +1225,13 @@ func (p *parser) listItem(data []byte, flags *ListType) int {
}
// skip leading whitespace on first line
for data[i] == ' ' {
for i < len(data) && data[i] == ' ' {
i++
}
// find the end of the line
line := i
for i > 0 && data[i-1] != '\n' {
for i > 0 && i < len(data) && data[i-1] != '\n' {
i++
}
@ -1228,7 +1251,7 @@ gatherlines:
i++
// find the end of this line
for data[i-1] != '\n' {
for i < len(data) && data[i-1] != '\n' {
i++
}
@ -1242,11 +1265,18 @@ gatherlines:
// calculate the indentation
indent := 0
for indent < 4 && line+indent < i && data[line+indent] == ' ' {
indent++
indentIndex := 0
if data[line] == '\t' {
indentIndex++
indent += 4
} else {
for indent < 4 && line+indent < i && data[line+indent] == ' ' {
indent++
indentIndex++
}
}
chunk := data[line+indent : i]
chunk := data[line+indentIndex : i]
// evaluate how this line fits in
switch {
@ -1287,7 +1317,7 @@ gatherlines:
if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
// is the next item still a part of this list?
next := i
for data[next] != '\n' {
for next < len(data) && data[next] != '\n' {
next++
}
for next < len(data)-1 && data[next] == '\n' {
@ -1315,7 +1345,7 @@ gatherlines:
}
// add the line into the working buffer without prefix
raw.Write(data[line+indent : i])
raw.Write(data[line+indentIndex : i])
line = i
}
@ -1363,8 +1393,11 @@ func (p *parser) renderParagraph(data []byte) {
beg++
}
end := len(data)
// trim trailing newline
end := len(data) - 1
if data[len(data)-1] == '\n' {
end--
}
// trim trailing spaces
for end > beg && data[end-1] == ' ' {
@ -1403,7 +1436,8 @@ func (p *parser) paragraph(data []byte) int {
// did this blank line followed by a definition list item?
if p.flags&DefinitionLists != 0 {
if i < len(data)-1 && data[i+1] == ':' {
return p.list(data[prev:], ListTypeDefinition)
ret := p.list(data[prev:], ListTypeDefinition)
return ret
}
}
@ -1436,7 +1470,7 @@ func (p *parser) paragraph(data []byte) int {
block.HeaderID = id
// find the end of the underline
for data[i] != '\n' {
for i < len(data) && data[i] != '\n' {
i++
}
return i
@ -1469,7 +1503,8 @@ func (p *parser) paragraph(data []byte) int {
// if there's a definition list item, prev line is a definition term
if p.flags&DefinitionLists != 0 {
if p.dliPrefix(current) != 0 {
return p.list(data[prev:], ListTypeDefinition)
ret := p.list(data[prev:], ListTypeDefinition)
return ret
}
}
@ -1485,7 +1520,12 @@ func (p *parser) paragraph(data []byte) int {
}
// otherwise, scan to the beginning of the next line
i += bytes.IndexByte(data[i:], '\n') + 1
nl := bytes.IndexByte(data[i:], '\n')
if nl >= 0 {
i += nl + 1
} else {
i += len(data[i:])
}
}
p.renderParagraph(data[:i])

View File

@ -1655,14 +1655,14 @@ func TestIsFenceLine(t *testing.T) {
tests := []struct {
data []byte
syntaxRequested bool
newlineOptional bool
wantEnd int
wantMarker string
wantSyntax string
}{
{
data: []byte("```"),
wantEnd: 0,
data: []byte("```"),
wantEnd: 3,
wantMarker: "```",
},
{
data: []byte("```\nstuff here\n"),
@ -1679,23 +1679,15 @@ func TestIsFenceLine(t *testing.T) {
data: []byte("stuff here\n```\n"),
wantEnd: 0,
},
{
data: []byte("```"),
newlineOptional: true,
wantEnd: 3,
wantMarker: "```",
},
{
data: []byte("```"),
syntaxRequested: true,
newlineOptional: true,
wantEnd: 3,
wantMarker: "```",
},
{
data: []byte("``` go"),
syntaxRequested: true,
newlineOptional: true,
wantEnd: 6,
wantMarker: "```",
wantSyntax: "go",
@ -1707,7 +1699,7 @@ func TestIsFenceLine(t *testing.T) {
if test.syntaxRequested {
syntax = new(string)
}
end, marker := isFenceLine(test.data, syntax, "```", test.newlineOptional)
end, marker := isFenceLine(test.data, syntax, "```")
if got, want := end, test.wantEnd; got != want {
t.Errorf("got end %v, want %v", got, want)
}

View File

@ -1142,7 +1142,7 @@ func TestUseXHTML(t *testing.T) {
func TestSkipHTML(t *testing.T) {
doTestsParam(t, []string{
"<div class=\"foo\"></div>\n\ntext\n\n<form>the form</form>",
"<p>text</p>\n",
"<p>text</p>\n\n<p>the form</p>\n",
"text <em>inline html</em> more text",
"<p>text inline html more text</p>\n",

View File

@ -387,7 +387,7 @@ func Parse(input []byte, opts Options) *Node {
p.notes = make([]*reference, 0)
}
p.block(preprocess(p, input))
p.block(input)
// Walk the tree and finish up some of unfinished blocks
for p.tip != nil {
p.finalize(p.tip)
@ -442,63 +442,6 @@ func (p *parser) parseRefsToAST() {
})
}
// preprocess does a preparatory first pass over the input:
// - normalize newlines
// - expand tabs (outside of fenced code blocks)
// - copy everything else
//
// It walks the input one line at a time. A line ends at the first '\r' or
// '\n' (or at end of input); an optional '\r' followed by an optional '\n'
// is consumed as the terminator, and a single '\n' is written in its place.
// Because a '\n' is written after every line, and for empty input as well,
// the returned buffer always ends with a newline.
//
// Tabs are expanded via expandTabs using TabSizeDefault, or TabSizeDouble
// when the TabSizeEight flag is set on the parser.
func preprocess(p *parser, input []byte) []byte {
var out bytes.Buffer
tabSize := TabSizeDefault
if p.flags&TabSizeEight != 0 {
tabSize = TabSizeDouble
}
// beg is the offset of the current line's first byte.
beg := 0
// lastFencedCodeBlockEnd is the input offset just past the most recently
// detected fenced code block (0 while none has been seen).
lastFencedCodeBlockEnd := 0
for beg < len(input) {
// Find end of this line, then process the line.
end := beg
for end < len(input) && input[end] != '\n' && input[end] != '\r' {
end++
}
if p.flags&FencedCode != 0 {
// track fenced code block boundaries to suppress tab expansion
// inside them; only probe for a new block once we are past the
// previous one:
if beg >= lastFencedCodeBlockEnd {
// called with doRender=false — presumably this only measures the
// block's length without emitting anything; verify against
// fencedCodeBlock.
if i := p.fencedCodeBlock(input[beg:], false); i > 0 {
lastFencedCodeBlockEnd = beg + i
}
}
}
// add the line body if present
if end > beg {
if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
out.Write(input[beg:end])
} else {
expandTabs(&out, input[beg:end], tabSize)
}
}
// consume the line terminator: "\r", "\n" or "\r\n" all count as one
if end < len(input) && input[end] == '\r' {
end++
}
if end < len(input) && input[end] == '\n' {
end++
}
// emit the normalized terminator, even if the last line had none
out.WriteByte('\n')
beg = end
}
// empty input?
if out.Len() == 0 {
out.WriteByte('\n')
}
return out.Bytes()
}
//
// Link references
//

View File

@ -1,13 +1,13 @@
<p>Here's a simple block:</p>
<div>
foo
foo
</div>
<p>This should be a code block, though:</p>
<pre><code>&lt;div&gt;
foo
foo
&lt;/div&gt;
</code></pre>
@ -19,11 +19,11 @@
<p>Now, nested:</p>
<div>
<div>
<div>
foo
</div>
</div>
<div>
<div>
foo
</div>
</div>
</div>
<p>This should just be an HTML comment:</p>

View File

@ -3,7 +3,7 @@
<!-- This is a simple comment -->
<!--
This is another comment.
This is another comment.
-->
<p>Paragraph two.</p>

View File

@ -939,8 +939,8 @@ _ underscore
[] square brackets
() parentheses
# hash mark
+ plus sign
- minus sign (hyphen)
+ plus sign
- minus sign (hyphen)
. dot
! exclamation mark
</code></pre>

6
testdata/Tabs.html vendored
View File

@ -13,13 +13,13 @@ indented with spaces</p></li>
<p>And:</p>
<pre><code> this code block is indented by two tabs
<pre><code> this code block is indented by two tabs
</code></pre>
<p>And:</p>
<pre><code>+ this is an example list item
indented with tabs
<pre><code>+ this is an example list item
indented with tabs
+ this is an example list item
indented with spaces