Construct AST WIP: add block-level nodes

Build a partial tree by adding block nodes. The block nodes will then be
traversed and inline markdown parsed inside each of them. Tests are
broken at this point until the full tree is constructed.
This commit is contained in:
Vytautas Šaltenis 2016-03-30 12:57:12 +03:00
parent 94893247d1
commit 7c95b7a189
2 changed files with 232 additions and 102 deletions

293
block.go
View File

@ -15,10 +15,23 @@ package blackfriday
import ( import (
"bytes" "bytes"
"html"
"regexp"
"github.com/shurcooL/sanitized_anchor_name" "github.com/shurcooL/sanitized_anchor_name"
) )
const (
	// Entity is a regular-expression fragment matching one HTML character
	// reference: hexadecimal numeric (&#x...;), decimal numeric (&#...;) or
	// named (&name;).
	Entity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
	// Escapable is a regular-expression character class listing the ASCII
	// punctuation characters that may follow a backslash in an escape.
	Escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is a cheap pre-check for text that may contain a
	// backslash escape or an entity. The backslash must be doubled twice:
	// once for the Go string literal and once for the regexp engine.
	// Writing "[\\&]" yields the pattern [\&], which is just an escaped
	// ampersand and never matches a backslash.
	reBackslashOrAmp = regexp.MustCompile("[\\\\&]")
	// reEntityOrEscapedChar matches, case-insensitively, either a backslash
	// followed by an Escapable character or a complete Entity.
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + Escapable + "|" + Entity)
	// reTrailingWhitespace matches a run of newlines, each optionally
	// followed by spaces, anchored at the end of the text.
	reTrailingWhitespace = regexp.MustCompile("(\n *)+$")
)
// Parse block-level data. // Parse block-level data.
// Note: this function and many that it calls assume that // Note: this function and many that it calls assume that
// the input buffer ends with a newline. // the input buffer ends with a newline.
@ -116,7 +129,7 @@ func (p *parser) block(data []byte) {
// or // or
// ______ // ______
if p.isHRule(data) { if p.isHRule(data) {
p.r.HRule() p.addBlock(HorizontalRule, nil)
var i int var i int
for i = 0; data[i] != '\n'; i++ { for i = 0; data[i] != '\n'; i++ {
} }
@ -189,6 +202,13 @@ func (p *parser) block(data []byte) {
p.nesting-- p.nesting--
} }
// addBlock closes any unmatched blocks, appends a new node of type typ via
// addChild, stores content as the node's raw content and returns the node.
func (p *parser) addBlock(typ NodeType, content []byte) *Node {
	p.closeUnmatchedBlocks()
	node := p.addChild(typ, 0)
	node.content = content
	return node
}
func (p *parser) isPrefixHeader(data []byte) bool { func (p *parser) isPrefixHeader(data []byte) bool {
if data[0] != '#' { if data[0] != '#' {
return false return false
@ -245,11 +265,9 @@ func (p *parser) prefixHeader(data []byte) int {
if id == "" && p.flags&AutoHeaderIDs != 0 { if id == "" && p.flags&AutoHeaderIDs != 0 {
id = sanitized_anchor_name.Create(string(data[i:end])) id = sanitized_anchor_name.Create(string(data[i:end]))
} }
p.r.BeginHeader(level, id) block := p.addBlock(Header, data[i:end])
header := p.r.CopyWrites(func() { block.HeaderID = id
p.inline(data[i:end]) block.Level = uint32(level)
})
p.r.EndHeader(level, id, header)
} }
return skip return skip
} }
@ -294,9 +312,14 @@ func (p *parser) titleBlock(data []byte, doRender bool) int {
} }
data = bytes.Join(splitData[0:i], []byte("\n")) data = bytes.Join(splitData[0:i], []byte("\n"))
p.r.TitleBlock(data) consumed := len(data)
data = bytes.TrimPrefix(data, []byte("% "))
data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
block := p.addBlock(Header, data)
block.Level = 1
block.IsTitleblock = true
return len(data) return consumed
} }
func (p *parser) html(data []byte, doRender bool) int { func (p *parser) html(data []byte, doRender bool) int {
@ -391,12 +414,17 @@ func (p *parser) html(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' { for end > 0 && data[end-1] == '\n' {
end-- end--
} }
p.r.BlockHtml(data[:end]) finalizeHtmlBlock(p.addBlock(HtmlBlock, data[:end]))
} }
return i return i
} }
// finalizeHtmlBlock moves the block's raw content into Literal, with any
// match of reTrailingWhitespace removed, and then resets the raw content to
// an empty slice.
func finalizeHtmlBlock(block *Node) {
	trimmed := reTrailingWhitespace.ReplaceAll(block.content, []byte{})
	block.Literal = trimmed
	block.content = []byte{}
}
// HTML comment, lax form // HTML comment, lax form
func (p *parser) htmlComment(data []byte, doRender bool) int { func (p *parser) htmlComment(data []byte, doRender bool) int {
i := p.inlineHtmlComment(data) i := p.inlineHtmlComment(data)
@ -409,7 +437,8 @@ func (p *parser) htmlComment(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' { for end > 0 && data[end-1] == '\n' {
end-- end--
} }
p.r.BlockHtml(data[:end]) block := p.addBlock(HtmlBlock, data[:end])
finalizeHtmlBlock(block)
} }
return size return size
} }
@ -441,7 +470,7 @@ func (p *parser) htmlHr(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' { for end > 0 && data[end-1] == '\n' {
end-- end--
} }
p.r.BlockHtml(data[:end]) finalizeHtmlBlock(p.addBlock(HtmlBlock, data[:end]))
} }
return size return size
} }
@ -464,7 +493,9 @@ func (p *parser) htmlFindTag(data []byte) (string, bool) {
func (p *parser) htmlFindEnd(tag string, data []byte) int { func (p *parser) htmlFindEnd(tag string, data []byte) int {
// assume data[0] == '<' && data[1] == '/' already tested // assume data[0] == '<' && data[1] == '/' already tested
if tag == "hr" {
return 2
}
// check if tag is a match // check if tag is a match
closetag := []byte("</" + tag + ">") closetag := []byte("</" + tag + ">")
if !bytes.HasPrefix(data, closetag) { if !bytes.HasPrefix(data, closetag) {
@ -642,6 +673,10 @@ func (p *parser) fencedCode(data []byte, doRender bool) int {
} }
var work bytes.Buffer var work bytes.Buffer
if lang != nil {
work.Write([]byte(*lang))
work.WriteByte('\n')
}
for { for {
// safe to assume beg < len(data) // safe to assume beg < len(data)
@ -668,48 +703,76 @@ func (p *parser) fencedCode(data []byte, doRender bool) int {
beg = end beg = end
} }
syntax := "" //syntax := ""
if lang != nil { //if lang != nil {
syntax = *lang // syntax = *lang
} //}
if doRender { if doRender {
p.r.BlockCode(work.Bytes(), syntax) block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
block.IsFenced = true
finalizeCodeBlock(block)
} }
return beg return beg
} }
// unescapeChar decodes a single escape sequence. When str starts with a
// backslash the result is the one byte following it; anything else is run
// through html.UnescapeString. str must be non-empty, and must hold at
// least two bytes when it starts with a backslash.
func unescapeChar(str []byte) []byte {
	if str[0] != '\\' {
		decoded := html.UnescapeString(string(str))
		return []byte(decoded)
	}
	return []byte{str[1]}
}
// unescapeString returns str unchanged when reBackslashOrAmp finds nothing
// in it. Otherwise it returns a copy in which every match of
// reEntityOrEscapedChar has been replaced by the result of unescapeChar.
func unescapeString(str []byte) []byte {
	if !reBackslashOrAmp.Match(str) {
		// Nothing that could be an escape or entity: skip the rewrite.
		return str
	}
	return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
}
// finalizeCodeBlock converts the raw content of a code block into its final
// fields and then clears the raw content.
//
// For a fenced block the first line of the content is taken as the info
// string (unescaped via unescapeString) and everything after the first
// newline becomes Literal. For a non-fenced block, Literal is the content
// with any match of reTrailingWhitespace replaced by a single newline.
func finalizeCodeBlock(block *Node) {
	if block.IsFenced {
		// bytes.IndexByte reports -1 when there is no newline; slicing with
		// that index would panic. In that case treat the whole content as
		// the first line and leave the literal empty.
		firstLine := block.content
		rest := block.content[len(block.content):]
		if newlinePos := bytes.IndexByte(block.content, '\n'); newlinePos >= 0 {
			firstLine = block.content[:newlinePos]
			rest = block.content[newlinePos+1:]
		}
		block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
		block.Literal = rest
	} else {
		block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{'\n'})
	}
	block.content = nil
}
func (p *parser) table(data []byte) int { func (p *parser) table(data []byte) int {
var header bytes.Buffer table := p.addBlock(Table, nil)
i, columns := p.tableHeader(&header, data) i, columns := p.tableHeader(data)
if i == 0 { if i == 0 {
p.tip = table.Parent
table.unlink()
return 0 return 0
} }
var body bytes.Buffer p.addBlock(TableBody, nil)
body.Write(p.r.CaptureWrites(func() { for i < len(data) {
for i < len(data) { pipes, rowStart := 0, i
pipes, rowStart := 0, i for ; data[i] != '\n'; i++ {
for ; data[i] != '\n'; i++ { if data[i] == '|' {
if data[i] == '|' { pipes++
pipes++
}
} }
if pipes == 0 {
i = rowStart
break
}
// include the newline in data sent to tableRow
i++
p.tableRow(data[rowStart:i], columns, false)
} }
}))
p.r.Table(header.Bytes(), body.Bytes(), columns) if pipes == 0 {
i = rowStart
break
}
// include the newline in data sent to tableRow
i++
p.tableRow(data[rowStart:i], columns, false)
}
return i return i
} }
@ -723,7 +786,7 @@ func isBackslashEscaped(data []byte, i int) bool {
return backslashes&1 == 1 return backslashes&1 == 1
} }
func (p *parser) tableHeader(out *bytes.Buffer, data []byte) (size int, columns []int) { func (p *parser) tableHeader(data []byte) (size int, columns []int) {
i := 0 i := 0
colCount := 1 colCount := 1
for i = 0; data[i] != '\n'; i++ { for i = 0; data[i] != '\n'; i++ {
@ -821,16 +884,15 @@ func (p *parser) tableHeader(out *bytes.Buffer, data []byte) (size int, columns
return return
} }
out.Write(p.r.CaptureWrites(func() { p.addBlock(TableHead, nil)
p.tableRow(header, columns, true) p.tableRow(header, columns, true)
}))
size = i + 1 size = i + 1
return return
} }
func (p *parser) tableRow(data []byte, columns []int, header bool) { func (p *parser) tableRow(data []byte, columns []int, header bool) {
p.addBlock(TableRow, nil)
i, col := 0, 0 i, col := 0, 0
var rowWork bytes.Buffer
if data[i] == '|' && !isBackslashEscaped(data, i) { if data[i] == '|' && !isBackslashEscaped(data, i) {
i++ i++
@ -856,29 +918,19 @@ func (p *parser) tableRow(data []byte, columns []int, header bool) {
cellEnd-- cellEnd--
} }
cellWork := p.r.CaptureWrites(func() { cell := p.addBlock(TableCell, data[cellStart:cellEnd])
p.inline(data[cellStart:cellEnd]) cell.IsHeader = header
}) cell.Align = columns[col]
if header {
p.r.TableHeaderCell(&rowWork, cellWork, columns[col])
} else {
p.r.TableCell(&rowWork, cellWork, columns[col])
}
} }
// pad it out with empty columns to get the right number // pad it out with empty columns to get the right number
for ; col < len(columns); col++ { for ; col < len(columns); col++ {
if header { cell := p.addBlock(TableCell, nil)
p.r.TableHeaderCell(&rowWork, nil, columns[col]) cell.IsHeader = header
} else { cell.Align = columns[col]
p.r.TableCell(&rowWork, nil, columns[col])
}
} }
// silently ignore rows with too many cells // silently ignore rows with too many cells
p.r.TableRow(rowWork.Bytes())
} }
// returns blockquote prefix length // returns blockquote prefix length
@ -910,6 +962,7 @@ func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
// parse a blockquote fragment // parse a blockquote fragment
func (p *parser) quote(data []byte) int { func (p *parser) quote(data []byte) int {
block := p.addBlock(BlockQuote, nil)
var raw bytes.Buffer var raw bytes.Buffer
beg, end := 0, 0 beg, end := 0, 0
for beg < len(data) { for beg < len(data) {
@ -928,22 +981,18 @@ func (p *parser) quote(data []byte) int {
end++ end++
} }
end++ end++
if pre := p.quotePrefix(data[beg:]); pre > 0 { if pre := p.quotePrefix(data[beg:]); pre > 0 {
// skip the prefix // skip the prefix
beg += pre beg += pre
} else if p.terminateBlockquote(data, beg, end) { } else if p.terminateBlockquote(data, beg, end) {
break break
} }
// this line is part of the blockquote // this line is part of the blockquote
raw.Write(data[beg:end]) raw.Write(data[beg:end])
beg = end beg = end
} }
p.block(raw.Bytes())
p.r.BlockQuote(p.r.CaptureWrites(func() { p.finalize(block)
p.block(raw.Bytes())
}))
return end return end
} }
@ -995,7 +1044,9 @@ func (p *parser) code(data []byte) int {
work.WriteByte('\n') work.WriteByte('\n')
p.r.BlockCode(work.Bytes(), "") block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
block.IsFenced = false
finalizeCodeBlock(block)
return i return i
} }
@ -1057,10 +1108,19 @@ func (p *parser) dliPrefix(data []byte) int {
func (p *parser) list(data []byte, flags ListType) int { func (p *parser) list(data []byte, flags ListType) int {
i := 0 i := 0
flags |= ListItemBeginningOfList flags |= ListItemBeginningOfList
p.r.BeginList(flags) block := p.addBlock(List, nil)
block.ListData = &ListData{ // TODO: fill in the real ListData
Flags: flags,
Tight: true,
BulletChar: '*',
Delimiter: 0,
}
for i < len(data) { for i < len(data) {
skip := p.listItem(data[i:], &flags) skip := p.listItem(data[i:], &flags)
if flags&ListItemContainsBlock != 0 {
block.ListData.Tight = false
}
i += skip i += skip
if skip == 0 || flags&ListItemEndOfList != 0 { if skip == 0 || flags&ListItemEndOfList != 0 {
break break
@ -1068,10 +1128,53 @@ func (p *parser) list(data []byte, flags ListType) int {
flags &= ^ListItemBeginningOfList flags &= ^ListItemBeginningOfList
} }
p.r.EndList(flags) above := block.Parent
finalizeList(block)
p.tip = above
return i return i
} }
// endsWithBlankLine is meant to report whether block ends with a blank line,
// descending into the last child of lists and list items as needed.
//
// TODO: figure this out. The blank-line check is not wired up yet, so the
// function currently walks the chain and always returns false.
func endsWithBlankLine(block *Node) bool {
	for node := block; node != nil; node = node.LastChild {
		//if node.lastLineBlank {
		//return true
		//}
		if node.Type != List && node.Type != Item {
			// Only lists and list items are descended into.
			break
		}
	}
	return false
}
// finalizeList marks the list block as closed and clears ListData.Tight
// when endsWithBlankLine reports a blank line between items or between the
// children of an item.
func finalizeList(block *Node) {
	block.open = false
	for item := block.FirstChild; item != nil; item = item.Next {
		// A non-final list item ending with a blank line makes the list
		// loose; no further scanning is needed.
		if endsWithBlankLine(item) && item.Next != nil {
			block.ListData.Tight = false
			break
		}
		// Look at the item's own children for blank lines between them.
		// The break below only leaves this inner loop; the scan then
		// continues with the next item.
		for child := item.FirstChild; child != nil; child = child.Next {
			if endsWithBlankLine(child) && (item.Next != nil || child.Next != nil) {
				block.ListData.Tight = false
				break
			}
		}
	}
}
// Parse a single list item. // Parse a single list item.
// Assumes initial prefix is already removed if this is a sublist. // Assumes initial prefix is already removed if this is a sublist.
func (p *parser) listItem(data []byte, flags *ListType) int { func (p *parser) listItem(data []byte, flags *ListType) int {
@ -1223,44 +1326,34 @@ gatherlines:
rawBytes := raw.Bytes() rawBytes := raw.Bytes()
block := p.addBlock(Item, nil)
block.ListData = &ListData{ // TODO: fill in the real ListData
Flags: *flags,
Tight: false,
BulletChar: '*',
Delimiter: 0,
}
// render the contents of the list item // render the contents of the list item
var cooked bytes.Buffer
if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 { if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
// intermediate render of block item, except for definition term // intermediate render of block item, except for definition term
if sublist > 0 { if sublist > 0 {
cooked.Write(p.r.CaptureWrites(func() { p.block(rawBytes[:sublist])
p.block(rawBytes[:sublist]) p.block(rawBytes[sublist:])
p.block(rawBytes[sublist:])
}))
} else { } else {
cooked.Write(p.r.CaptureWrites(func() { p.block(rawBytes)
p.block(rawBytes)
}))
} }
} else { } else {
// intermediate render of inline item // intermediate render of inline item
if sublist > 0 { if sublist > 0 {
cooked.Write(p.r.CaptureWrites(func() { child := p.addChild(Paragraph, 0)
p.inline(rawBytes[:sublist]) child.content = rawBytes[:sublist]
p.block(rawBytes[sublist:]) p.block(rawBytes[sublist:])
}))
} else { } else {
cooked.Write(p.r.CaptureWrites(func() { child := p.addChild(Paragraph, 0)
p.inline(rawBytes) child.content = rawBytes
}))
} }
} }
// render the actual list item
cookedBytes := cooked.Bytes()
parsedEnd := len(cookedBytes)
// strip trailing newlines
for parsedEnd > 0 && cookedBytes[parsedEnd-1] == '\n' {
parsedEnd--
}
p.r.ListItem(cookedBytes[:parsedEnd], *flags)
return line return line
} }
@ -1284,9 +1377,7 @@ func (p *parser) renderParagraph(data []byte) {
end-- end--
} }
p.r.BeginParagraph() p.addBlock(Paragraph, data[beg:end])
p.inline(data[beg:end])
p.r.EndParagraph()
} }
func (p *parser) paragraph(data []byte) int { func (p *parser) paragraph(data []byte) int {
@ -1335,11 +1426,9 @@ func (p *parser) paragraph(data []byte) int {
id = sanitized_anchor_name.Create(string(data[prev:eol])) id = sanitized_anchor_name.Create(string(data[prev:eol]))
} }
p.r.BeginHeader(level, id) block := p.addBlock(Header, data[prev:eol])
header := p.r.CopyWrites(func() { block.Level = uint32(level)
p.inline(data[prev:eol]) block.HeaderID = id
})
p.r.EndHeader(level, id, header)
// find the end of the underline // find the end of the underline
for data[i] != '\n' { for data[i] != '\n' {

View File

@ -228,6 +228,12 @@ type parser struct {
// presence. If a ref is also a footnote, it's stored both in refs and here // presence. If a ref is also a footnote, it's stored both in refs and here
// in notes. Slice is nil if footnotes not enabled. // in notes. Slice is nil if footnotes not enabled.
notes []*reference notes []*reference
doc *Node
tip *Node // = doc
oldTip *Node
lastMatchedContainer *Node // = doc
allClosed bool
} }
func (p *parser) getRef(refid string) (ref *reference, found bool) { func (p *parser) getRef(refid string) (ref *reference, found bool) {
@ -250,6 +256,34 @@ func (p *parser) getRef(refid string) (ref *reference, found bool) {
return ref, found return ref, found
} }
// finalize marks block as closed and moves the parser's tip to the block's
// parent.
func (p *parser) finalize(block *Node) {
	parent := block.Parent
	block.open = false
	p.tip = parent
}
// addChild appends a new node of the given type to the tree and makes it the
// parser's tip. Tips that cannot contain the type are finalized first, which
// moves the tip up to their parents. The new node starts with empty, non-nil
// content. The offset argument is currently unused.
func (p *parser) addChild(node NodeType, offset uint32) *Node {
	// Climb until the tip accepts a child of this type.
	for !p.tip.canContain(node) {
		p.finalize(p.tip)
	}
	child := NewNode(node)
	child.content = []byte{}
	p.tip.appendChild(child)
	p.tip = child
	return child
}
// closeUnmatchedBlocks finalizes every node from oldTip up to, but not
// including, lastMatchedContainer, then records that all blocks are closed.
// It does nothing when allClosed is already set.
func (p *parser) closeUnmatchedBlocks() {
	if p.allClosed {
		return
	}
	for p.oldTip != p.lastMatchedContainer {
		// Read the parent before finalizing so the walk upward continues
		// from the right node.
		next := p.oldTip.Parent
		p.finalize(p.oldTip)
		p.oldTip = next
	}
	p.allClosed = true
}
// //
// //
// Public interface // Public interface
@ -366,6 +400,13 @@ func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
p.maxNesting = 16 p.maxNesting = 16
p.insideLink = false p.insideLink = false
docNode := NewNode(Document)
p.doc = docNode
p.tip = docNode
p.oldTip = docNode
p.lastMatchedContainer = docNode
p.allClosed = true
// register inline parsers // register inline parsers
p.inlineCallback['*'] = emphasis p.inlineCallback['*'] = emphasis
p.inlineCallback['_'] = emphasis p.inlineCallback['_'] = emphasis