Vulkan-Docs/doc/specs/vulkan/reflow.py

#!/usr/bin/python3
#
# Copyright (c) 2016 The Khronos Group Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Used for automatic reflow of Vulkan spec to satisfy the agreed layout to
# minimize git churn. Most of the logic has to to with detecting asciidoc
# markup or block types that *shouldn't* be reflowed (tables, code) and
# ignoring them. It's very likely there are many asciidoc constructs not yet
# accounted for in the script, our usage of asciidoc markup is intentionally
# somewhat limited.

# Usage: reflow.py [-overwrite] [-out dir] [-suffix str] files
#   -overwrite updates in place (usually not desired, and risky)
#   -out specifies directory to create output file in, default 'out'
#   -suffix specifies suffix to add to output files, default ''
#   files are asciidoc source files from the Vulkan spec to reflow.

# For error and file-loading interfaces only
from reflib import *

import argparse, copy, os, pdb, re, string, sys

# Markup that always ends a paragraph
#   empty line or whitespace
#   [block options]
#   [[anchor]]
#   //                  comment
#   <<<<                page break
#   :attribute-setting
#   macro-directive::terms
#   +                   standalone list item continuation
#   label::             labelled list - label must be standalone
endPara = re.compile('^( *|\[.*\]|//.*|<<<<|:.*|[a-z]+::.*|\+|.*::)$')

# Markup that's OK in a contiguous paragraph but otherwise passed through
#   .anything
#   === Section Titles
endParaContinue = re.compile('^(\..*|=+ .*)$')

# Markup for block delimiters whose contents *should* be reformatted
#   --   (exactly two)  (open block)
#   **** (4 or more)    (sidebar block - why do we have these?!)
#   ==== (4 or more)    (example block)
#   ____ (4 or more)    (quote block)
blockReflow = re.compile('^(--|[*=_]{4,})$')

# Markup for block delimiters whose contents should *not* be reformatted
#   |=== (3 or more)  (table)
#   ++++ (4 or more)  (passthrough block)
#   .... (4 or more)  (literal block)
#   //// (4 or more)  (comment block)
#   ---- (4 or more)  (listing block)
#   **** (4 or more)  (sidebar block)
blockPassthrough = re.compile('^(\|={3,}|[-+./]{4,})$')

# Markup for introducing bullet points (hanging paragraphs)
#   * bullet
#     ** bullet
#     -- bullet
#   . bullet
#   :: bullet
beginBullet = re.compile('^ *([*-]+|\.|::) ')

# Text that (may) not end sentences

# A single letter followed by a period, typically a middle initial.
endInitial = re.compile('^[[:upper:]]\.$')
# An abbreviation, which doesn't (usually) end a line.
endAbbrev = re.compile('(e\.g|i\.e|c\.f)\.$', re.IGNORECASE)

# State machine for reflowing.
#
# blockStack - The last element is a line with the asciidoc block delimiter
#   that's currently in effect, such as
#     '--', '----', '****', '======', or '+++++++++'.
#   This affects whether or not the block contents should be formatted.
# reflowStack - The last element is True or False if the current blockStack
#   contents should be reflowed
# margin - margin to reflow text to
# para - list of lines in the paragraph being accumulated. When this is
#   non-empty, there is a current paragraph.
# leadIndent - indent level (in spaces) of the first line of a paragraph.
# hangIndent - indent level of the remaining lines of a paragraph.
# file - file pointer to write to
# filename - base name of file being read from
# lineNumber - line number being read from the input file
# breakPeriod - True if justification should break to a new line after
#   the end of a sentence
# breakInitial - True if justification should break to a new
class ReflowState:
    """Represents the state of the reflow operation"""
    def __init__(self, filename, margin = 76, file = sys.stdout, breakPeriod = True):
        self.blockStack = [ None ]
        self.reflowStack = [ True ]
        self.margin = margin
        self.para = []
        self.leadIndent = 0
        self.hangIndent = 0
        self.file = file
        self.filename = filename
        self.lineNumber = 0
        self.breakPeriod = breakPeriod
        self.breakInitial = True

    def incrLineNumber(self):
        self.lineNumber = self.lineNumber + 1

    # Print an array of lines with newlines already present
    def printLines(self, lines):
        logDiag(':: printLines:', len(lines), 'lines: ', lines[0], end='')
        for line in lines:
            print(line, file=self.file, end='')

    # Returns True if word ends with a sentence-period, False otherwise. Allows
    # for contraction cases which won't end a line:
    #  - A single letter (if breakInitial is True)
    #  - Abbreviations: 'c.f.', 'e.g.', 'i.e.' (or mixed-case versions)
    def endSentence(self,word):
        if (word[-1:] != '.' or
            endAbbrev.search(word) or
            (self.breakInitial and endInitial.match(word))):
            return False
        else:
            return True

    # Reflow the current paragraph, respecting the paragraph lead and
    # hanging indentation levels. The algorithm also respects trailing '+'
    # signs that indicate imbedded newlines, and will not reflow a very long
    # word immediately after a bullet point.
    def reflowPara(self):
        logDiag('reflowPara lead indent = ', self.leadIndent,
                'hangIndent =', self.hangIndent,
                'para:', self.para[0], end='')

        # Total words processed (we care about the *first* word vs. others)
        wordCount = 0

        # Tracks the *previous* word processed. It must not be empty.
        prevWord = ' '

        #import pdb; pdb.set_trace()

        for line in self.para:
            line = line.rstrip()
            words = line.split()

            # logDiag('reflowPara: input line =', line)
            numWords = len(words) - 1

            for i in range(0, numWords + 1):
                word = words[i]
                wordLen = len(word)
                wordCount += 1

                endEscape = False
                if (i == numWords and word == '+'):
                    # Trailing ' +' must stay on the same line
                    endEscape = word
                    # logDiag('reflowPara last word of line =', word, 'prevWord =', prevWord, 'endEscape =', endEscape)
                else:
                    True
                    # logDiag('reflowPara wordCount =', wordCount, 'word =', word, 'prevWord =', prevWord)

                if wordCount == 1:
                    # The first word of the paragraph is treated specially.
                    # The loop logic becomes trickier if all this code is
                    # done prior to looping over lines and words, so all the
                    # setup logic is done here.

                    outPara = []
                    outLine = ''.ljust(self.leadIndent) + word
                    outLineLen = self.leadIndent + wordLen

                    # If the paragraph begins with a bullet point, generate
                    # a hanging indent level if there isn't one already.
                    if beginBullet.match(self.para[0]):
                        bulletPoint = True
                        if len(self.para) > 1:
                            logDiag('reflowPara first line matches bullet point',
                                    'but indent already hanging @ input line',
                                    self.lineNumber)
                        else:
                            logDiag('reflowPara first line matches bullet point -'
                                    'single line, assuming hangIndent @ input line',
                                    self.lineNumber)
                            self.hangIndent = outLineLen + 1
                    else:
                        bulletPoint = False
                else:
                    # Possible actions to take with this word
                    #
                    # addWord - add word to current line
                    # closeLine - append line and start a new (null) one
                    # startLine - add word to a new line

                    # Default behavior if all the tests below fail is to add
                    # this word to the current line, and keep accumulating
                    # that line.
                    (addWord, closeLine, startLine) = (True, False, False)

                    # How long would this line be if the word were added?
                    newLen = outLineLen + 1 + wordLen

                    # Are we on the first word following a bullet point?
                    firstBullet = (wordCount == 2 and bulletPoint)

                    if (endEscape):
                        # If the new word ends the input line with ' +',
                        # add it to the current line.

                        (addWord, closeLine, startLine) = (True, True, False)
                    elif newLen > self.margin:
                        if firstBullet:
                            # If the word follows a bullet point, add it to
                            # the current line no matter its length.

                            (addWord, closeLine, startLine) = (True, True, False)
                        else:
                            # The word overflows, so add it to a new line.

                            (addWord, closeLine, startLine) = (False, True, True)
                    elif (self.breakPeriod and
                          (wordCount > 2 or not firstBullet) and
                          self.endSentence(prevWord)):
                        # If the previous word ends a sentence and
                        # breakPeriod is set, start a new line.
                        # The complicated logic allows for leading bullet
                        # points which are periods (implicitly numbered lists).
                        # @@@ But not yet for explicitly numbered lists.

                        (addWord, closeLine, startLine) = (False, True, True)

                    # Add a word to the current line
                    if addWord:
                        if outLine:
                            outLine += ' ' + word
                            outLineLen = newLen
                        else:
                            # Fall through to startLine case if there's no
                            # current line yet.
                            startLine = True

                    # Add current line to the output paragraph. Force
                    # starting a new line, although we don't yet know if it
                    # will ever have contents.
                    if closeLine:
                        if outLine:
                            outPara.append(outLine + '\n')
                            outLine = None

                    # Start a new line and add a word to it
                    if startLine:
                        outLine = ''.ljust(self.hangIndent) + word
                        outLineLen = self.hangIndent + wordLen

                # Track the previous word, for use in breaking at end of
                # a sentence
                prevWord = word

        # Add this line to the output paragraph.
        if (outLine):
            outPara.append(outLine + '\n')

        return outPara

    # Emit a paragraph, possibly reflowing it depending on the block
    # context. Reset the paragraph accumulator.
    def emitPara(self):
        if self.para != []:
            if self.reflowStack[-1]:
                self.printLines(self.reflowPara())
            else:
                self.printLines(self.para)

        # Reset the paragraph, including its indentation level
        self.para = []
        self.leadIndent = 0
        self.hangIndent = 0

    # 'line' ends a paragraph and should itself be emitted.
    # line may be None to indicate EOF or other exception.
    def endPara(self, line):
        logDiag('endPara line', self.lineNumber, ': emitting paragraph')

        # Emit current paragraph, this line, and reset tracker
        self.emitPara()

        if line:
            self.printLines( [ line ] )

    # 'line' ends a paragraph (unless there's already a paragraph being
    # accumulated, e.g. len(para) > 0 - currently not implemented)
    def endParaContinue(self, line):
        self.endPara(line)

    # 'line' begins or ends a block. If beginning a block, tag whether or
    # not to reflow the contents.
    def endBlock(self, line, reflow = False):
        self.endPara(line)

        if self.blockStack[-1] == line:
            logDiag('endBlock line', self.lineNumber,
                    ': popping block end depth:', len(self.blockStack),
                    ':', line, end='')
            self.blockStack.pop()
            self.reflowStack.pop()
        else:
            # Start a block
            self.blockStack.append(line)
            self.reflowStack.append(reflow)

            logDiag('endBlock reflow =', reflow, ' line', self.lineNumber,
                    ': pushing block start depth', len(self.blockStack),
                    ':', line, end='')

    # 'line' begins or ends a block. The paragraphs in the block *should* be
    # reformatted (e.g. a NOTE).
    def endParaBlockReflow(self, line):
        self.endBlock(line, reflow = True)

    # 'line' begins or ends a block. The paragraphs in the block should
    # *not* be reformatted (e.g. a NOTE).
    def endParaBlockPassthrough(self, line):
        self.endBlock(line, reflow = False)

    # 'line' starts or continues a paragraph.
    # Paragraphs may have "hanging indent", e.g.
    #   * Bullet point...
    #     ... continued
    # In this case, when the higher indentation level ends, so does the
    # paragraph.
    def addLine(self, line):
        logDiag('addLine line', self.lineNumber, ':', line, end='')

        # See https://stackoverflow.com/questions/13648813/what-is-the-pythonic-way-to-count-the-leading-spaces-in-a-string
        indent = len(line) - len(line.lstrip())

        # A hanging paragraph ends due to a less-indented line.
        if self.para != [] and indent < self.hangIndent:
            logDiag('addLine: line reduces indentation, emit paragraph')
            self.emitPara()

        # A bullet point (or something that looks like one) always ends the
        # current paragraph.
        if beginBullet.match(line):
            logDiag('addLine: line matches beginBullet, emit paragraph')
            self.emitPara()

        if self.para == []:
            # Begin a new paragraph
            self.para = [ line ]
            self.leadIndent = indent
            self.hangIndent = indent
        else:
            # Add a line to a paragraph. Increase the hanging indentation
            # level - once.
            if self.hangIndent == self.leadIndent:
                self.hangIndent = indent
            self.para.append(line)

def reflowFile(filename, overwrite, outDir, suffix):
    logDiag('reflow: filename', filename)

    lines = loadFile(filename)
    if (lines == None):
        return

    # Output file handle and reflow object for this file. There are no race
    # conditions on overwriting the input, but it's not recommended unless
    # you have backing store such as git.

    if overwrite:
        outFilename = filename
    else:
        outFilename = outDir + '/' + os.path.basename(filename) + suffix

    try:
        fp = open(outFilename, 'w', encoding='utf8')
    except:
        logWarn('Cannot open output file', filename, ':', sys.exc_info()[0])
        return None

    state = ReflowState(filename, file = fp)

    for line in lines:
        state.incrLineNumber()

        # The logic here is broken. If we're in a non-reflowable block and
        # this line *doesn't* end the block, it should always be
        # accumulated.

        if endPara.match(line):
            # Ending a paragraph. Emit the current paragraph, if any, and
            # prepare to begin a new paragraph.

            state.endPara(line)
        elif endParaContinue.match(line):
            # For now, always just end the paragraph.
            # Could check see if len(para) > 0 to accumulate.

            state.endParaContinue(line)
        elif blockReflow.match(line):
            # Starting or ending a block whose contents may be reflowed.
            # Blocks cannot be nested.

            state.endParaBlockReflow(line)
        elif blockPassthrough.match(line):
            # Starting or ending a block whose contents must not be reflowed.
            # These are tables, etc. Blocks cannot be nested.

            state.endParaBlockPassthrough(line)
        else:
            # Just accumulate a line to the current paragraph. Watch out for
            # hanging indents / bullet-points and track that indent level.

            state.addLine(line)

    # Cleanup at end of file
    state.endPara(None)

    # Sanity check on block nesting
    if len(state.blockStack) > 1:
        logWarn('file', filename,
                'mismatched asciidoc block delimiters at EOF:',
                state.blockStack[-1])

    fp.close()

# Patterns used to recognize interesting lines in an asciidoc source file.
# These patterns are only compiled once.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-diag', action='store', dest='diagFile',
                        help='Set the diagnostic file')
    parser.add_argument('-warn', action='store', dest='warnFile',
                        help='Set the warning file')
    parser.add_argument('-log', action='store', dest='logFile',
                        help='Set the log file for both diagnostics and warnings')
    parser.add_argument('-overwrite', action='store_true',
                        help='Overwrite input filenames instead of writing different output filenames')
    parser.add_argument('-out', action='store', dest='outDir',
                        default='out',
                        help='Set the output directory in which updated files are generated (default: out)')
    parser.add_argument('-suffix', action='store', dest='suffix',
                        default='',
                        help='Set the suffix added to updated file names (default: none)')
    parser.add_argument('files', metavar='filename', nargs='*',
                        help='a filename to reflow text in')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    results = parser.parse_args()

    setLogFile(True,  True, results.logFile)
    setLogFile(True, False, results.diagFile)
    setLogFile(False, True, results.warnFile)

    if (results.overwrite):
        logWarn('reflow.py: will overwrite all input files')

    for file in results.files:
        reflowFile(file, results.overwrite, results.outDir, results.suffix)