Construct AST WIP: add block-level nodes

Build a partial tree by adding block nodes. The block nodes will then be
traversed and inline markdown parsed inside each of them. Tests are
broken at this point until the full tree is constructed.
This commit is contained in:
Vytautas Šaltenis 2016-03-30 12:57:12 +03:00
parent 94893247d1
commit 7c95b7a189
2 changed files with 232 additions and 102 deletions

293
block.go
View File

@ -15,10 +15,23 @@ package blackfriday
import (
"bytes"
"html"
"regexp"
"github.com/shurcooL/sanitized_anchor_name"
)
// Regular-expression source fragments shared by the patterns below.
const (
	// Entity matches a named, decimal (#123) or hexadecimal (#x1f) HTML
	// entity reference, including the leading '&' and trailing ';'.
	Entity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
	// Escapable is a character class listing the ASCII punctuation
	// characters that may follow a backslash to form an escape.
	Escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is a cheap pre-check: it matches if the input
	// contains a backslash or an ampersand anywhere. A raw string is used on
	// purpose: the interpreted literal "[\\&]" yields the pattern [\&],
	// which is just an escaped '&' and never matches a backslash.
	reBackslashOrAmp = regexp.MustCompile(`[\\&]`)
	// reEntityOrEscapedChar matches, case-insensitively, either a backslash
	// followed by an Escapable character or an Entity.
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + Escapable + "|" + Entity)
	// reTrailingWhitespace matches one or more "newline followed by spaces"
	// groups anchored at the end of the input.
	reTrailingWhitespace = regexp.MustCompile("(\n *)+$")
)
// Parse block-level data.
// Note: this function and many that it calls assume that
// the input buffer ends with a newline.
@ -116,7 +129,7 @@ func (p *parser) block(data []byte) {
// or
// ______
if p.isHRule(data) {
p.r.HRule()
p.addBlock(HorizontalRule, nil)
var i int
for i = 0; data[i] != '\n'; i++ {
}
@ -189,6 +202,13 @@ func (p *parser) block(data []byte) {
p.nesting--
}
// addBlock closes any unmatched open blocks, appends a new child node of
// type typ via addChild, stores content on it and returns the new node.
func (p *parser) addBlock(typ NodeType, content []byte) *Node {
	p.closeUnmatchedBlocks()

	node := p.addChild(typ, 0)
	node.content = content

	return node
}
func (p *parser) isPrefixHeader(data []byte) bool {
if data[0] != '#' {
return false
@ -245,11 +265,9 @@ func (p *parser) prefixHeader(data []byte) int {
if id == "" && p.flags&AutoHeaderIDs != 0 {
id = sanitized_anchor_name.Create(string(data[i:end]))
}
p.r.BeginHeader(level, id)
header := p.r.CopyWrites(func() {
p.inline(data[i:end])
})
p.r.EndHeader(level, id, header)
block := p.addBlock(Header, data[i:end])
block.HeaderID = id
block.Level = uint32(level)
}
return skip
}
@ -294,9 +312,14 @@ func (p *parser) titleBlock(data []byte, doRender bool) int {
}
data = bytes.Join(splitData[0:i], []byte("\n"))
p.r.TitleBlock(data)
consumed := len(data)
data = bytes.TrimPrefix(data, []byte("% "))
data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
block := p.addBlock(Header, data)
block.Level = 1
block.IsTitleblock = true
return len(data)
return consumed
}
func (p *parser) html(data []byte, doRender bool) int {
@ -391,12 +414,17 @@ func (p *parser) html(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' {
end--
}
p.r.BlockHtml(data[:end])
finalizeHtmlBlock(p.addBlock(HtmlBlock, data[:end]))
}
return i
}
// finalizeHtmlBlock moves the block's raw content into Literal, dropping
// anything matched by reTrailingWhitespace, and resets content to an empty
// slice.
func finalizeHtmlBlock(block *Node) {
	raw := block.content
	block.content = []byte{}
	block.Literal = reTrailingWhitespace.ReplaceAll(raw, []byte{})
}
// HTML comment, lax form
func (p *parser) htmlComment(data []byte, doRender bool) int {
i := p.inlineHtmlComment(data)
@ -409,7 +437,8 @@ func (p *parser) htmlComment(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' {
end--
}
p.r.BlockHtml(data[:end])
block := p.addBlock(HtmlBlock, data[:end])
finalizeHtmlBlock(block)
}
return size
}
@ -441,7 +470,7 @@ func (p *parser) htmlHr(data []byte, doRender bool) int {
for end > 0 && data[end-1] == '\n' {
end--
}
p.r.BlockHtml(data[:end])
finalizeHtmlBlock(p.addBlock(HtmlBlock, data[:end]))
}
return size
}
@ -464,7 +493,9 @@ func (p *parser) htmlFindTag(data []byte) (string, bool) {
func (p *parser) htmlFindEnd(tag string, data []byte) int {
// assume data[0] == '<' && data[1] == '/' already tested
if tag == "hr" {
return 2
}
// check if tag is a match
closetag := []byte("</" + tag + ">")
if !bytes.HasPrefix(data, closetag) {
@ -642,6 +673,10 @@ func (p *parser) fencedCode(data []byte, doRender bool) int {
}
var work bytes.Buffer
if lang != nil {
work.Write([]byte(*lang))
work.WriteByte('\n')
}
for {
// safe to assume beg < len(data)
@ -668,48 +703,76 @@ func (p *parser) fencedCode(data []byte, doRender bool) int {
beg = end
}
syntax := ""
if lang != nil {
syntax = *lang
}
//syntax := ""
//if lang != nil {
// syntax = *lang
//}
if doRender {
p.r.BlockCode(work.Bytes(), syntax)
block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
block.IsFenced = true
finalizeCodeBlock(block)
}
return beg
}
// unescapeChar resolves a single escape sequence. If str begins with a
// backslash, the result is the one byte that follows it; otherwise str is
// treated as an HTML entity and decoded with html.UnescapeString.
// str must be non-empty, and at least two bytes long when it starts with a
// backslash.
func unescapeChar(str []byte) []byte {
	switch str[0] {
	case '\\':
		return []byte{str[1]}
	default:
		decoded := html.UnescapeString(string(str))
		return []byte(decoded)
	}
}
// unescapeString returns str with every match of reEntityOrEscapedChar
// replaced by the result of unescapeChar. When the reBackslashOrAmp
// pre-check does not match, str is returned unchanged (same slice).
func unescapeString(str []byte) []byte {
	if !reBackslashOrAmp.Match(str) {
		return str
	}
	return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
}
// finalizeCodeBlock converts the raw content gathered for a code block into
// its final fields and then clears content.
//
// For a fenced block, the first line of content (up to the first newline)
// is unescaped and stored in Info; everything after that newline becomes
// Literal. If content holds no newline at all, the whole content is treated
// as the first line and Literal is left empty, instead of slicing with the
// -1 returned by bytes.IndexByte (which would panic).
//
// For a non-fenced block, Literal is content with the run matched by
// reTrailingWhitespace collapsed to a single newline.
func finalizeCodeBlock(block *Node) {
	if block.IsFenced {
		firstLine := block.content
		var rest []byte
		if newlinePos := bytes.IndexByte(block.content, '\n'); newlinePos >= 0 {
			firstLine = block.content[:newlinePos]
			rest = block.content[newlinePos+1:]
		}
		block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
		block.Literal = rest
	} else {
		block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{'\n'})
	}
	block.content = nil
}
func (p *parser) table(data []byte) int {
var header bytes.Buffer
i, columns := p.tableHeader(&header, data)
table := p.addBlock(Table, nil)
i, columns := p.tableHeader(data)
if i == 0 {
p.tip = table.Parent
table.unlink()
return 0
}
var body bytes.Buffer
p.addBlock(TableBody, nil)
body.Write(p.r.CaptureWrites(func() {
for i < len(data) {
pipes, rowStart := 0, i
for ; data[i] != '\n'; i++ {
if data[i] == '|' {
pipes++
}
for i < len(data) {
pipes, rowStart := 0, i
for ; data[i] != '\n'; i++ {
if data[i] == '|' {
pipes++
}
if pipes == 0 {
i = rowStart
break
}
// include the newline in data sent to tableRow
i++
p.tableRow(data[rowStart:i], columns, false)
}
}))
p.r.Table(header.Bytes(), body.Bytes(), columns)
if pipes == 0 {
i = rowStart
break
}
// include the newline in data sent to tableRow
i++
p.tableRow(data[rowStart:i], columns, false)
}
return i
}
@ -723,7 +786,7 @@ func isBackslashEscaped(data []byte, i int) bool {
return backslashes&1 == 1
}
func (p *parser) tableHeader(out *bytes.Buffer, data []byte) (size int, columns []int) {
func (p *parser) tableHeader(data []byte) (size int, columns []int) {
i := 0
colCount := 1
for i = 0; data[i] != '\n'; i++ {
@ -821,16 +884,15 @@ func (p *parser) tableHeader(out *bytes.Buffer, data []byte) (size int, columns
return
}
out.Write(p.r.CaptureWrites(func() {
p.tableRow(header, columns, true)
}))
p.addBlock(TableHead, nil)
p.tableRow(header, columns, true)
size = i + 1
return
}
func (p *parser) tableRow(data []byte, columns []int, header bool) {
p.addBlock(TableRow, nil)
i, col := 0, 0
var rowWork bytes.Buffer
if data[i] == '|' && !isBackslashEscaped(data, i) {
i++
@ -856,29 +918,19 @@ func (p *parser) tableRow(data []byte, columns []int, header bool) {
cellEnd--
}
cellWork := p.r.CaptureWrites(func() {
p.inline(data[cellStart:cellEnd])
})
if header {
p.r.TableHeaderCell(&rowWork, cellWork, columns[col])
} else {
p.r.TableCell(&rowWork, cellWork, columns[col])
}
cell := p.addBlock(TableCell, data[cellStart:cellEnd])
cell.IsHeader = header
cell.Align = columns[col]
}
// pad it out with empty columns to get the right number
for ; col < len(columns); col++ {
if header {
p.r.TableHeaderCell(&rowWork, nil, columns[col])
} else {
p.r.TableCell(&rowWork, nil, columns[col])
}
cell := p.addBlock(TableCell, nil)
cell.IsHeader = header
cell.Align = columns[col]
}
// silently ignore rows with too many cells
p.r.TableRow(rowWork.Bytes())
}
// returns blockquote prefix length
@ -910,6 +962,7 @@ func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
// parse a blockquote fragment
func (p *parser) quote(data []byte) int {
block := p.addBlock(BlockQuote, nil)
var raw bytes.Buffer
beg, end := 0, 0
for beg < len(data) {
@ -928,22 +981,18 @@ func (p *parser) quote(data []byte) int {
end++
}
end++
if pre := p.quotePrefix(data[beg:]); pre > 0 {
// skip the prefix
beg += pre
} else if p.terminateBlockquote(data, beg, end) {
break
}
// this line is part of the blockquote
raw.Write(data[beg:end])
beg = end
}
p.r.BlockQuote(p.r.CaptureWrites(func() {
p.block(raw.Bytes())
}))
p.block(raw.Bytes())
p.finalize(block)
return end
}
@ -995,7 +1044,9 @@ func (p *parser) code(data []byte) int {
work.WriteByte('\n')
p.r.BlockCode(work.Bytes(), "")
block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
block.IsFenced = false
finalizeCodeBlock(block)
return i
}
@ -1057,10 +1108,19 @@ func (p *parser) dliPrefix(data []byte) int {
func (p *parser) list(data []byte, flags ListType) int {
i := 0
flags |= ListItemBeginningOfList
p.r.BeginList(flags)
block := p.addBlock(List, nil)
block.ListData = &ListData{ // TODO: fill in the real ListData
Flags: flags,
Tight: true,
BulletChar: '*',
Delimiter: 0,
}
for i < len(data) {
skip := p.listItem(data[i:], &flags)
if flags&ListItemContainsBlock != 0 {
block.ListData.Tight = false
}
i += skip
if skip == 0 || flags&ListItemEndOfList != 0 {
break
@ -1068,10 +1128,53 @@ func (p *parser) list(data []byte, flags ListType) int {
flags &= ^ListItemBeginningOfList
}
p.r.EndList(flags)
above := block.Parent
finalizeList(block)
p.tip = above
return i
}
// endsWithBlankLine is meant to report whether block ends with a blank
// line, descending if needed into lists and sublists (via LastChild).
//
// TODO: figure this out. The blank-line check is still commented out, so
// this always returns false for now.
func endsWithBlankLine(block *Node) bool {
	for node := block; node != nil; node = node.LastChild {
		//if node.lastLineBlank {
		//return true
		//}
		if node.Type != List && node.Type != Item {
			break
		}
	}
	return false
}
// finalizeList marks a list block as closed and clears ListData.Tight when
// blank lines are found between its items or between the children of an
// item, as reported by endsWithBlankLine.
func finalizeList(block *Node) {
	block.open = false

	for item := block.FirstChild; item != nil; item = item.Next {
		hasNextItem := item.Next != nil

		// A non-final list item ending with a blank line makes the list
		// loose; nothing else can change after that, so stop right away.
		if hasNextItem && endsWithBlankLine(item) {
			block.ListData.Tight = false
			return
		}

		// Look at the item's own children for blank lines between them.
		for child := item.FirstChild; child != nil; child = child.Next {
			if endsWithBlankLine(child) && (hasNextItem || child.Next != nil) {
				block.ListData.Tight = false
				return
			}
		}
	}
}
// Parse a single list item.
// Assumes initial prefix is already removed if this is a sublist.
func (p *parser) listItem(data []byte, flags *ListType) int {
@ -1223,44 +1326,34 @@ gatherlines:
rawBytes := raw.Bytes()
block := p.addBlock(Item, nil)
block.ListData = &ListData{ // TODO: fill in the real ListData
Flags: *flags,
Tight: false,
BulletChar: '*',
Delimiter: 0,
}
// render the contents of the list item
var cooked bytes.Buffer
if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
// intermediate render of block item, except for definition term
if sublist > 0 {
cooked.Write(p.r.CaptureWrites(func() {
p.block(rawBytes[:sublist])
p.block(rawBytes[sublist:])
}))
p.block(rawBytes[:sublist])
p.block(rawBytes[sublist:])
} else {
cooked.Write(p.r.CaptureWrites(func() {
p.block(rawBytes)
}))
p.block(rawBytes)
}
} else {
// intermediate render of inline item
if sublist > 0 {
cooked.Write(p.r.CaptureWrites(func() {
p.inline(rawBytes[:sublist])
p.block(rawBytes[sublist:])
}))
child := p.addChild(Paragraph, 0)
child.content = rawBytes[:sublist]
p.block(rawBytes[sublist:])
} else {
cooked.Write(p.r.CaptureWrites(func() {
p.inline(rawBytes)
}))
child := p.addChild(Paragraph, 0)
child.content = rawBytes
}
}
// render the actual list item
cookedBytes := cooked.Bytes()
parsedEnd := len(cookedBytes)
// strip trailing newlines
for parsedEnd > 0 && cookedBytes[parsedEnd-1] == '\n' {
parsedEnd--
}
p.r.ListItem(cookedBytes[:parsedEnd], *flags)
return line
}
@ -1284,9 +1377,7 @@ func (p *parser) renderParagraph(data []byte) {
end--
}
p.r.BeginParagraph()
p.inline(data[beg:end])
p.r.EndParagraph()
p.addBlock(Paragraph, data[beg:end])
}
func (p *parser) paragraph(data []byte) int {
@ -1335,11 +1426,9 @@ func (p *parser) paragraph(data []byte) int {
id = sanitized_anchor_name.Create(string(data[prev:eol]))
}
p.r.BeginHeader(level, id)
header := p.r.CopyWrites(func() {
p.inline(data[prev:eol])
})
p.r.EndHeader(level, id, header)
block := p.addBlock(Header, data[prev:eol])
block.Level = uint32(level)
block.HeaderID = id
// find the end of the underline
for data[i] != '\n' {

View File

@ -228,6 +228,12 @@ type parser struct {
// presence. If a ref is also a footnote, it's stored both in refs and here
// in notes. Slice is nil if footnotes not enabled.
notes []*reference
doc *Node
tip *Node // = doc
oldTip *Node
lastMatchedContainer *Node // = doc
allClosed bool
}
func (p *parser) getRef(refid string) (ref *reference, found bool) {
@ -250,6 +256,34 @@ func (p *parser) getRef(refid string) (ref *reference, found bool) {
return ref, found
}
// finalize marks block as closed and moves the parser's tip up to the
// block's parent.
func (p *parser) finalize(block *Node) {
	block.open = false
	p.tip = block.Parent
}
// addChild creates a node of the given type, appends it under the nearest
// ancestor of the current tip that can contain it (finalizing every tip
// that cannot), makes it the new tip and returns it. The new node starts
// with empty, non-nil content. The offset argument is currently unused.
func (p *parser) addChild(node NodeType, offset uint32) *Node {
	// Walk up the tree until the tip accepts this node type.
	for {
		if p.tip.canContain(node) {
			break
		}
		p.finalize(p.tip)
	}

	child := NewNode(node)
	child.content = []byte{}
	p.tip.appendChild(child)
	p.tip = child
	return child
}
// closeUnmatchedBlocks finalizes every block from oldTip up to (but not
// including) lastMatchedContainer and then records that all blocks are
// closed. It does nothing when allClosed is already set.
func (p *parser) closeUnmatchedBlocks() {
	if p.allClosed {
		return
	}
	for p.oldTip != p.lastMatchedContainer {
		// Grab the parent first so the walk continues from it after the
		// current block has been finalized.
		up := p.oldTip.Parent
		p.finalize(p.oldTip)
		p.oldTip = up
	}
	p.allClosed = true
}
//
//
// Public interface
@ -366,6 +400,13 @@ func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
p.maxNesting = 16
p.insideLink = false
docNode := NewNode(Document)
p.doc = docNode
p.tip = docNode
p.oldTip = docNode
p.lastMatchedContainer = docNode
p.allClosed = true
// register inline parsers
p.inlineCallback['*'] = emphasis
p.inlineCallback['_'] = emphasis