Vulkan-Docs/scripts/reflow.py

#!/usr/bin/python3
#
# Copyright (c) 2016-2019 The Khronos Group Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Used for automatic reflow of Vulkan spec to satisfy the agreed layout to
# minimize git churn. Most of the logic has to do with detecting asciidoc
# markup or block types that *shouldn't* be reflowed (tables, code) and
# ignoring them. It's very likely there are many asciidoc constructs not yet
# accounted for in the script, our usage of asciidoc markup is intentionally
# somewhat limited.
#
# Also used to insert identifying tags on explicit Valid Usage statements.

# Usage: reflow.py [-noflow] [-tagvu] [-nextvu #] [-overwrite] [-out dir] [-suffix str] files
#   -noflow acts as a passthrough, instead of reflowing text. Other
#       processing may occur.
#   -tagvu generates explicit VUID tag for Valid Usage statements which
#       don't already have them.
#   -nextvu # starts VUID tag generation at the specified # instead of
#       the value wired into the reflow.py script.
#   -overwrite updates in place (can be risky, make sure there are backups)
#   -out specifies directory to create output file in, default 'out'
#   -suffix specifies suffix to add to output files, default ''
#   files are asciidoc source files from the Vulkan spec to reflow.

# For error and file-loading interfaces only
import argparse
import os
import re
import sys
from reflib import loadFile, logDiag, logWarn, setLogFile
from reflow_count import startVUID

# Vulkan-specific - will consolidate into scripts/ like OpenXR soon
# Put the 'xml' directory (a relative path, so resolved against the current
# working directory) first on the module search path. This must stay ahead
# of the two imports below - presumably that is where vkapi and
# vkconventions live (TODO confirm).
sys.path.insert(0, 'xml')

# NOTE(review): 'api' is not referenced in the visible part of this file.
import vkapi as api
from vkconventions import VulkanConventions as APIConventions
# API-specific conventions object. This script reads its file_suffix,
# spec_reflow_path and spec_no_reflow_dirs attributes.
conventions = APIConventions()

# Markup that always ends a paragraph
#   empty line or whitespace
#   [block options]
#   [[anchor]]
#   //                  comment
#   <<<<                page break
#   :attribute-setting
#   macro-directive::terms
#   +                   standalone list item continuation
#   label::             labelled list - label must be standalone
#
# The pattern is anchored at both ends, so it must match the whole line.
# Lines are matched with their trailing newline still attached; this works
# because '$' also matches just before a final newline.
# Note that the "whitespace" alternative accepts spaces only, so a line
# containing just tabs is not recognized as blank.
endPara = re.compile(r'^( *|\[.*\]|//.*|<<<<|:.*|[a-z]+::.*|\+|.*::)$')

# Special case of markup ending a paragraph, used to track the current
# command/structure. This allows for either OpenXR or Vulkan API path
# conventions, and uses the file suffix defined by the API conventions.
#
# Named groups:
#   type - directory name following 'api/' (e.g. 'protos', 'structs')
#   name - base name of the included file, without the suffix
#
# The suffix is passed through re.escape() so that it is matched literally.
# Spliced in raw, the leading '.' of a suffix such as '.txt' would act as a
# regex wildcard and match any character.
includePat = re.compile(r'^include::(\.\./)+(generated/+)?api/+(?P<type>\w+)/(?P<name>\w+)'
                        + re.escape(conventions.file_suffix) + r'\[\]')

# Find the first pname: pattern in a Valid Usage statement
# The parameter name following 'pname:' is captured in the named group
# 'param'. The pattern is unanchored, so search() returns the leftmost
# occurrence on the line.
pnamePat = re.compile(r'pname:(?P<param>\w+)')

# Markup that's OK in a contiguous paragraph but otherwise passed through
#   .anything           (any line starting with a period, e.g. a block title)
#   === Section Titles  (one or more '=' followed by a space)
#
# The whole line must match. Because the first alternative accepts any line
# beginning with '.', it also matches a '....' literal block delimiter.
endParaContinue = re.compile(r'^(\..*|=+ .*)$')

# Markup for block delimiters whose contents *should* be reformatted
#   --   (exactly two)  (open block)
#   **** (4 or more)    (sidebar block - why do we have these?!)
#   ==== (4 or more)    (example block)
#   ____ (4 or more)    (quote block)
#
# The delimiter must be the only thing on the line. The character class
# accepts any mix of '*', '=' and '_', so the regex does not itself check
# that all the characters of a delimiter are the same.
blockReflow = re.compile(r'^(--|[*=_]{4,})$')

# Markup for block delimiters whose contents should *not* be reformatted
#   |=== (3 or more)  (table)
#   ++++ (4 or more)  (passthrough block)
#   .... (4 or more)  (literal block)
#   //// (4 or more)  (comment block)
#   ---- (4 or more)  (listing block)
#
# Sidebar blocks ('****') are NOT matched here; they are matched by
# blockReflow and so have their contents reflowed.
# The delimiter must be the only thing on the line. The character class
# accepts any mix of '-', '+', '.' and '/'; the '-' comes first in the
# class, so it is a literal hyphen rather than a range operator.
blockPassthrough = re.compile(r'^(\|={3,}|[-+./]{4,})$')

# Markup for introducing bullet points (hanging paragraphs)
#   * bullet
#     ** bullet
#     -- bullet
#   . bullet
#   :: bullet
#
# Matches optional leading spaces, then either a run of '-', '*' or '.'
# characters or a literal '::', followed by one space.
# The '-' is deliberately placed first in the character class so that it is
# a literal hyphen. Written as [*-.] it formed a range from '*' to '.',
# which also accepted '+' and ',' as bullet markers.
beginBullet = re.compile(r'^ *([-*.]+|::) ')

# Text that (may) not end sentences
# Both patterns are applied to a single whitespace-delimited word.

# A single letter followed by a period, typically a middle initial.
# Anchored at both ends, so the word must be exactly one capital letter
# plus the period.
endInitial = re.compile(r'^[A-Z]\.$')
# An abbreviation, which doesn't (usually) end a line.
# Anchored only at the end and case-insensitive, so a word with leading
# punctuation such as '(e.g.' or a capitalized 'I.e.' is recognized too.
endAbbrev = re.compile(r'(e\.g|i\.e|c\.f)\.$', re.IGNORECASE)

# State machine for reflowing.
#
# blockStack - The last element is a line with the asciidoc block delimiter
#   that's currently in effect, such as
#     '--', '----', '****', '======', or '+++++++++'.
#   This affects whether or not the block contents should be formatted.
# reflowStack - The last element is True or False if the current blockStack
#   contents should be reflowed.
# vuStack - the last element is True or False if the current blockStack
#   contents are an explicit Valid Usage block.
# margin - margin to reflow text to.
# para - list of lines in the paragraph being accumulated. When this is
#   non-empty, there is a current paragraph.
# lastTitle - true if the previous line was a document title line (e.g.
#   :leveloffset: 0 - no attempt to track changes to this is made).
# leadIndent - indent level (in spaces) of the first line of a paragraph.
# hangIndent - indent level of the remaining lines of a paragraph.
# file - file pointer to write to.
# filename - base name of file being read from.
# lineNumber - line number being read from the input file.
# breakPeriod - True if justification should break to a new line after
#   the end of a sentence.
# breakInitial - True if a single capital letter followed by a period
#   (apparently an initial in someone's name) should not be treated as the
#   end of a sentence, so no line break is forced after it. **TBD**
# reflow - True if text should be reflowed, False to pass through unchanged.
# vuPrefix - Prefix of generated Valid Usage tags
# vuFormat - Format string for generating Valid Usage tags. First argument
#   is vuPrefix, second is command/struct name, third is parameter name,
#   fourth is the tag number.
# nextvu - Integer to start tagging un-numbered Valid Usage statements with,
#   or None if no tagging should be done.
# apiName - String name of a Vulkan structure or command for VUID tag
#   generation, or '' if one hasn't been included in this file yet.
class ReflowState:
    """State machine that accumulates and reflows the paragraphs of one file.

    The caller classifies each input line and calls the matching method:
    endPara / endParaContinue for lines that terminate a paragraph,
    endParaBlockReflow / endParaBlockPassthrough for block delimiters, and
    addLine for ordinary text. Completed paragraphs are written to 'file',
    reflowed to 'margin' columns unless reflowing is disabled either
    globally or by the innermost open block.
    """
    def __init__(self,
                 filename,
                 margin = 76,
                 file = sys.stdout,
                 breakPeriod = True,
                 reflow = True,
                 nextvu = None):
        """Set up the state for reflowing one file.

        filename - name of the input file; used only in messages.
        margin - maximum length of a reflowed output line.
        file - writable file object that receives the output.
        breakPeriod - True to start a new line after each sentence.
        reflow - False to pass paragraphs through without reflowing.
        nextvu - first number to use for generated VUID tags, or None if
            no tags should be generated.
        """
        # Three parallel stacks, pushed and popped together by endBlock().
        # Element 0 describes the file level, outside any block.
        self.blockStack = [ None ]      # delimiter line that opened the block
        self.reflowStack = [ True ]     # should the block contents be reflowed?
        self.vuStack = [ False ]        # is the block a Valid Usage block?
        self.margin = margin
        # Lines of the paragraph being accumulated; empty if there is none
        self.para = []
        # Maintained by the caller: was the previous line a document title?
        self.lastTitle = False
        # Indentation (in characters) of the first / remaining paragraph lines
        self.leadIndent = 0
        self.hangIndent = 0
        self.file = file
        self.filename = filename
        self.lineNumber = 0
        self.breakPeriod = breakPeriod
        # When True, a lone initial such as 'J.' does not end a sentence
        self.breakInitial = True
        self.reflow = reflow
        self.vuPrefix = 'VUID'
        # Arguments: prefix, API name, parameter name, tag number (the
        # number is zero-padded to at least five digits)
        self.vuFormat = '{0}-{1}-{2}-{3:0>5d}'
        self.nextvu = nextvu
        # Name of the most recently included command/structure, set by the
        # caller; empty until one has been seen
        self.apiName = ''

    # Advance the count of input lines read so far.
    def incrLineNumber(self):
        self.lineNumber += 1

    # Print an array of lines with newlines already present.
    # An empty array is accepted and writes nothing.
    def printLines(self, lines):
        # Checked before the diagnostic below, which indexes lines[0] and
        # would otherwise raise IndexError on an empty array.
        if not lines:
            return

        logDiag(':: printLines:', len(lines), 'lines: ', lines[0], end='')
        for line in lines:
            print(line, file=self.file, end='')

    # Returns True if word ends with a sentence-period, False otherwise.
    # Allows for contraction cases which won't end a line:
    #  - A single letter (if breakInitial is True)
    #  - Abbreviations: 'c.f.', 'e.g.', 'i.e.' (or mixed-case versions)
    def endSentence(self, word):
        if (word[-1:] != '.' or
            endAbbrev.search(word) or
                (self.breakInitial and endInitial.match(word))):
            return False

        return True

    # Returns True if word is a Valid Usage ID Tag anchor, i.e. if it
    # starts with '[[VUID-'.
    def vuidAnchor(self, word):
        return (word[0:7] == '[[VUID-')

    # Reflow the current paragraph, respecting the paragraph lead and
    # hanging indentation levels. The algorithm also respects trailing '+'
    # signs that indicate embedded newlines, and will not reflow a very long
    # word immediately after a bullet point.
    # Just return the paragraph unchanged if the -noflow argument was
    # given, or if the paragraph contains no words at all.
    #
    # Returns a list of output lines, each ending in a newline. May set
    # self.hangIndent when a single-line bullet paragraph has to be wrapped.
    def reflowPara(self):
        if not self.reflow:
            return self.para

        # A paragraph with no words (for example a line holding only tabs,
        # which endPara does not recognize as blank) has nothing to reflow.
        # Pass it through untouched; without this guard the output
        # accumulators below would never be set up by a first word.
        if not any(line.split() for line in self.para):
            return self.para

        logDiag('reflowPara lead indent = ', self.leadIndent,
                'hangIndent =', self.hangIndent,
                'para:', self.para[0], end='')

        # Total words processed (we care about the *first* word vs. others)
        wordCount = 0

        # Tracks the *previous* word processed. It must not be empty.
        prevWord = ' '

        # Output accumulators:
        #   outPara - completed output lines
        #   outLine - line being built, or None if none is in progress
        #   outLineLen - length of outLine
        #   bulletPoint - True if the paragraph starts with a bullet
        outPara = []
        outLine = None
        outLineLen = 0
        bulletPoint = False

        for line in self.para:
            line = line.rstrip()
            words = line.split()

            lastIndex = len(words) - 1

            for i, word in enumerate(words):
                wordLen = len(word)
                wordCount += 1

                # A trailing ' +' (hard line break) must stay on the same
                # line as the text preceding it.
                endEscape = (i == lastIndex and word == '+')

                if wordCount == 1:
                    # The first word of the paragraph is treated specially.
                    # The loop logic becomes trickier if all this code is
                    # done prior to looping over lines and words, so the
                    # per-paragraph setup is done here.

                    outLine = ''.ljust(self.leadIndent) + word
                    outLineLen = self.leadIndent + wordLen

                    # If the paragraph begins with a bullet point, generate
                    # a hanging indent level if there isn't one already.
                    if beginBullet.match(self.para[0]):
                        bulletPoint = True
                        if len(self.para) > 1:
                            logDiag('reflowPara first line matches bullet point',
                                    'but indent already hanging @ input line',
                                    self.lineNumber)
                        else:
                            logDiag('reflowPara first line matches bullet point -'
                                    'single line, assuming hangIndent @ input line',
                                    self.lineNumber)
                            # Continuation lines align with the text that
                            # follows the bullet marker and its space.
                            self.hangIndent = outLineLen + 1
                else:
                    # Possible actions to take with this word
                    #
                    # addWord - add word to current line
                    # closeLine - append line and start a new (null) one
                    # startLine - add word to a new line

                    # Default behavior if all the tests below fail is to add
                    # this word to the current line, and keep accumulating
                    # that line.
                    (addWord, closeLine, startLine) = (True, False, False)

                    # How long would this line be if the word were added?
                    newLen = outLineLen + 1 + wordLen

                    # Are we on the first word following a bullet point?
                    firstBullet = (wordCount == 2 and bulletPoint)

                    if endEscape:
                        # If the new word ends the input line with ' +',
                        # add it to the current line.

                        (addWord, closeLine, startLine) = (True, True, False)
                    elif self.vuidAnchor(word):
                        # If the new word is a Valid Usage anchor, break the
                        # line afterwards. Note that this should only happen
                        # immediately after a bullet point, but we don't
                        # currently check for this.
                        (addWord, closeLine, startLine) = (True, True, False)
                    elif newLen > self.margin:
                        if firstBullet:
                            # If the word follows a bullet point, add it to
                            # the current line no matter its length.

                            (addWord, closeLine, startLine) = (True, True, False)
                        else:
                            # The word overflows, so add it to a new line.

                            (addWord, closeLine, startLine) = (False, True, True)
                    elif (self.breakPeriod and
                          (wordCount > 2 or not firstBullet) and
                          self.endSentence(prevWord)):
                        # If the previous word ends a sentence and
                        # breakPeriod is set, start a new line.
                        # The complicated logic allows for leading bullet
                        # points which are periods (implicitly numbered lists).
                        # @@@ But not yet for explicitly numbered lists.

                        (addWord, closeLine, startLine) = (False, True, True)

                    # Add a word to the current line
                    if addWord:
                        if outLine:
                            outLine += ' ' + word
                            outLineLen = newLen
                        else:
                            # Fall through to startLine case if there's no
                            # current line yet.
                            startLine = True

                    # Add current line to the output paragraph. Force
                    # starting a new line, although we don't yet know if it
                    # will ever have contents.
                    if closeLine:
                        if outLine:
                            outPara.append(outLine + '\n')
                            outLine = None

                    # Start a new line and add a word to it
                    if startLine:
                        outLine = ''.ljust(self.hangIndent) + word
                        outLineLen = self.hangIndent + wordLen

                # Track the previous word, for use in breaking at end of
                # a sentence
                prevWord = word

        # Add the final, still-open line to the output paragraph.
        if outLine:
            outPara.append(outLine + '\n')

        return outPara

    # Emit a paragraph, possibly reflowing it depending on the block
    # context. Reset the paragraph accumulator.
    # If the paragraph is in a Valid Usage block and tags are being
    # assigned, an untagged list item first has a VUID anchor inserted
    # after its bullet, and self.nextvu is advanced.
    def emitPara(self):
        if self.para != []:
            if self.vuStack[-1] and self.nextvu is not None:
                # If:
                #   - this paragraph is in a Valid Usage block,
                #   - VUID tags are being assigned,
                # Try to assign VUIDs

                if nestedVuPat.search(self.para[0]):
                    # Check for nested bullet points. These should not be
                    # assigned VUIDs, nor present at all, because they break
                    # the VU extractor.
                    logWarn(self.filename + ': Invalid nested bullet point in VU block:', self.para[0])
                elif self.vuPrefix not in self.para[0]:
                    # If:
                    #   - a tag is not already present, and
                    #   - the paragraph is a properly marked-up list item
                    # Then add a VUID tag starting with the next free ID.

                    # Split the first line after the bullet point
                    matches = vuPat.search(self.para[0])
                    if matches is not None:
                        logDiag('findRefs: Matched vuPat on line:', self.para[0], end='')
                        head = matches.group('head')
                        tail = matches.group('tail')

                        # Use the first pname: statement in the paragraph as
                        # the parameter name in the VUID tag. This won't always
                        # be correct, but should be highly reliable.
                        for vuLine in self.para:
                            matches = pnamePat.search(vuLine)
                            if matches is not None:
                                break

                        if matches is not None:
                            paramName = matches.group('param')
                        else:
                            # The literal string 'None' becomes part of the tag
                            paramName = 'None'
                            logWarn(self.filename,
                                    'No param name found for VUID tag on line:',
                                    self.para[0])

                        newline = (head + ' [[' +
                                   self.vuFormat.format(self.vuPrefix,
                                                        self.apiName,
                                                        paramName,
                                                        self.nextvu) + ']] ' + tail)

                        logDiag('Assigning', self.vuPrefix, self.apiName, self.nextvu,
                                ' on line:', self.para[0], '->', newline, 'END')

                        self.para[0] = newline
                        self.nextvu = self.nextvu + 1
                    # A paragraph in a VU block that is not a list item is
                    # deliberately left alone without a warning. There are
                    # only a few cases of this, and they're all legitimate
                    # (e.g. following an ifdef). Leave detecting this case
                    # to another tool or hand inspection.

            if self.reflowStack[-1]:
                self.printLines(self.reflowPara())
            else:
                self.printLines(self.para)

        # Reset the paragraph, including its indentation level
        self.para = []
        self.leadIndent = 0
        self.hangIndent = 0

    # 'line' ends a paragraph and should itself be emitted.
    # line may be None to indicate EOF or other exception.
    def endPara(self, line):
        logDiag('endPara line', self.lineNumber, ': emitting paragraph')

        # Emit current paragraph, this line, and reset tracker
        self.emitPara()

        if line:
            self.printLines( [ line ] )

    # 'line' ends a paragraph (unless there's already a paragraph being
    # accumulated, e.g. len(para) > 0 - currently not implemented)
    def endParaContinue(self, line):
        self.endPara(line)

    # 'line' begins or ends a block. If beginning a block, tag whether or
    # not to reflow the contents.
    # vuBlock is True if the previous line indicates this is a Valid Usage
    # block.
    # The line closes the innermost block if it is identical to the
    # delimiter that opened it; otherwise it opens a new, nested block.
    def endBlock(self, line, reflow = False, vuBlock = False):
        self.endPara(line)

        if self.blockStack[-1] == line:
            logDiag('endBlock line', self.lineNumber,
                    ': popping block end depth:', len(self.blockStack),
                    ':', line, end='')
            self.blockStack.pop()
            self.reflowStack.pop()
            self.vuStack.pop()
        else:
            # Start a block
            self.blockStack.append(line)
            self.reflowStack.append(reflow)
            self.vuStack.append(vuBlock)

            logDiag('endBlock reflow =', reflow, ' line', self.lineNumber,
                    ': pushing block start depth', len(self.blockStack),
                    ':', line, end='')

    # 'line' begins or ends a block. The paragraphs in the block *should* be
    # reformatted (e.g. a NOTE).
    def endParaBlockReflow(self, line, vuBlock):
        self.endBlock(line, reflow = True, vuBlock = vuBlock)

    # 'line' begins or ends a block. The paragraphs in the block should
    # *not* be reformatted (e.g. a table or a listing).
    def endParaBlockPassthrough(self, line):
        self.endBlock(line, reflow = False)

    # 'line' starts or continues a paragraph.
    # Paragraphs may have "hanging indent", e.g.
    #   * Bullet point...
    #     ... continued
    # In this case, when the higher indentation level ends, so does the
    # paragraph.
    def addLine(self, line):
        logDiag('addLine line', self.lineNumber, ':', line, end='')

        # Number of leading whitespace characters.
        # See https://stackoverflow.com/questions/13648813/what-is-the-pythonic-way-to-count-the-leading-spaces-in-a-string
        indent = len(line) - len(line.lstrip())

        # A hanging paragraph ends due to a less-indented line.
        if self.para != [] and indent < self.hangIndent:
            logDiag('addLine: line reduces indentation, emit paragraph')
            self.emitPara()

        # A bullet point (or something that looks like one) always ends the
        # current paragraph.
        if beginBullet.match(line):
            logDiag('addLine: line matches beginBullet, emit paragraph')
            self.emitPara()

        if self.para == []:
            # Begin a new paragraph
            self.para = [ line ]
            self.leadIndent = indent
            self.hangIndent = indent
        else:
            # Add a line to a paragraph. Increase the hanging indentation
            # level - once.
            if self.hangIndent == self.leadIndent:
                self.hangIndent = indent
            self.para.append(line)

def reflowFile(filename, args):
    """Reflow one asciidoc source file and write the result.

    filename - path of the file to read.
    args - parsed command-line options. The overwrite, outDir, suffix,
        noflow and nextvu attributes are read. args.nextvu is updated if
        VUID tags were assigned, so the next file continues the numbering.

    Returns None. If the input cannot be loaded the function returns
    without output; if the output file cannot be opened a warning is logged
    and the file is skipped.
    """
    logDiag('reflow: filename', filename)

    lines = loadFile(filename)
    if lines is None:
        return

    # Output file handle and reflow object for this file. There are no race
    # conditions on overwriting the input, but it's not recommended unless
    # you have backing store such as git.

    if args.overwrite:
        outFilename = filename
    else:
        outFilename = os.path.join(args.outDir,
                                   os.path.basename(filename) + args.suffix)

    try:
        fp = open(outFilename, 'w', encoding='utf8')
    except OSError as err:
        # Report the file that actually failed to open, i.e. the output
        logWarn('Cannot open output file', outFilename, ':', err)
        return

    # The 'with' closes the output even if processing a line raises
    with fp:
        state = ReflowState(filename,
                            file = fp,
                            reflow = not args.noflow,
                            nextvu = args.nextvu)

        for line in lines:
            state.incrLineNumber()

            # Is this a title line (leading '= ' followed by text)?
            thisTitle = False

            # The logic here is broken. If we're in a non-reflowable block and
            # this line *doesn't* end the block, it should always be
            # accumulated.

            # Block delimiters are tested before the paragraph-ending
            # patterns. Otherwise a '////' comment block delimiter is
            # swallowed by endPara (as a '//' comment) and a '....' literal
            # block delimiter by endParaContinue (as a '.title'), and the
            # contents of those blocks are never passed through.

            if blockReflow.match(line):
                # Starting or ending a block whose contents may be reflowed.
                # Blocks cannot be nested.

                # First see if this is an explicit Valid Usage block, which
                # is the case if the previous input line is its title.
                vuBlock = (state.lineNumber > 1 and
                           lines[state.lineNumber-2] == '.Valid Usage\n')

                state.endParaBlockReflow(line, vuBlock)
            elif blockPassthrough.match(line):
                # Starting or ending a block whose contents must not be
                # reflowed. These are tables, etc. Blocks cannot be nested.

                state.endParaBlockPassthrough(line)
            elif endPara.match(line):
                # Ending a paragraph. Emit the current paragraph, if any, and
                # prepare to begin a new paragraph.

                state.endPara(line)

                # If this is an include:: line starting the definition of a
                # structure or command, track that for use in VUID generation.

                matches = includePat.search(line)
                if matches is not None:
                    include_type = matches.group('type')
                    if include_type in ('protos', 'structs'):
                        state.apiName = matches.group('name')

            elif endParaContinue.match(line):
                # For now, always just end the paragraph.
                # Could check see if len(para) > 0 to accumulate.

                state.endParaContinue(line)

                # If it's a title line, track that
                if line[0:2] == '= ':
                    thisTitle = True

            elif state.lastTitle:
                # The previous line was a document title line. This line
                # is the author / credits line and must not be reflowed.

                state.endPara(line)
            else:
                # Just accumulate a line to the current paragraph. Watch out
                # for hanging indents / bullet-points and track that indent
                # level.

                state.addLine(line)

            state.lastTitle = thisTitle

        # Cleanup at end of file: flush any paragraph still being accumulated
        state.endPara(None)

    # Sanity check on block nesting. Only the file-level entry should remain.
    if len(state.blockStack) > 1:
        logWarn('file', filename,
                'mismatched asciidoc block delimiters at EOF:',
                state.blockStack[-1])

    # Update the 'nextvu' value
    if args.nextvu != state.nextvu:
        logWarn('Updated nextvu to', state.nextvu, 'after file', filename)
        args.nextvu = state.nextvu

def reflowAllAdocFiles(folder_to_reflow, args):
    """Reflow every asciidoc file found under a directory tree.

    folder_to_reflow - root directory to search.
    args - parsed command-line options, passed through to reflowFile().

    Files whose names end with conventions.file_suffix are reflowed.
    Sub-directories whose lower-cased name is listed in
    conventions.spec_no_reflow_dirs are skipped, together with everything
    below them. Each file is processed exactly once.
    """
    for root, subdirs, files in os.walk(folder_to_reflow):
        for file in files:
            if file.endswith(conventions.file_suffix):
                file_path = os.path.join(root, file)
                reflowFile(file_path, args)

        # os.walk() itself descends into whatever is left in 'subdirs', so
        # the tree is pruned by editing that list in place. Recursing
        # explicitly as well would process nested files more than once and
        # would not stop os.walk() from entering the excluded directories.
        keep = []
        for subdir in subdirs:
            sub_folder = os.path.join(root, subdir)
            print('Sub-folder = %s' % sub_folder)
            if subdir.lower() not in conventions.spec_no_reflow_dirs:
                print('   Parsing = %s' % sub_folder)
                keep.append(subdir)
            else:
                print('   Skipping = %s' % sub_folder)
        subdirs[:] = keep

# Patterns for picking out Valid Usage list items in an asciidoc source
# file. They are compiled a single time, when the module is loaded.

# A Valid Usage list item: two leading spaces, then one or more asterisks.
#   head - the two spaces plus the asterisks
#   tail - the rest of the line, after any spaces following the bullet
# re.DOTALL lets '.' match the newline, so that 'tail' keeps the line's
# trailing newline instead of stopping just short of it.
vuPat = re.compile(r'^(?P<head>  [*]+)( *)(?P<tail>.*)', re.DOTALL)

# A nested list item: two leading spaces followed by at least two asterisks.
nestedVuPat = re.compile(r'^  \*\*')

# Command-line entry point: parse the options, reflow the requested files
# (or the whole spec tree if none are given), and record the next free VUID
# number for the following run.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-diag', action='store', dest='diagFile',
                        help='Set the diagnostic file')
    parser.add_argument('-warn', action='store', dest='warnFile',
                        help='Set the warning file')
    parser.add_argument('-log', action='store', dest='logFile',
                        help='Set the log file for both diagnostics and warnings')
    parser.add_argument('-overwrite', action='store_true',
                        help='Overwrite input filenames instead of writing different output filenames')
    parser.add_argument('-out', action='store', dest='outDir',
                        default='out',
                        help='Set the output directory in which updated files are generated (default: out)')
    parser.add_argument('-tagvu', action='store_true',
                        help='Tag un-tagged Valid Usage statements starting at the value wired into reflow.py')
    parser.add_argument('-nextvu', action='store', dest='nextvu', type=int,
                        default=None,
                        help='Tag un-tagged Valid Usage statements starting at the specified base VUID instead of the value wired into reflow.py')
    parser.add_argument('-noflow', action='store_true', dest='noflow',
                        help='Do not reflow text. Other actions may apply.')
    parser.add_argument('-suffix', action='store', dest='suffix',
                        default='',
                        help='Set the suffix added to updated file names (default: none)')
    parser.add_argument('files', metavar='filename', nargs='*',
                        help='a filename to reflow text in')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    args = parser.parse_args()

    setLogFile(True,  True, args.logFile)
    setLogFile(True, False, args.diagFile)
    setLogFile(False, True, args.warnFile)

    if args.overwrite:
        logWarn('reflow.py: will overwrite all input files')

    # -tagvu without an explicit -nextvu starts at the recorded value
    if args.tagvu and args.nextvu is None:
        args.nextvu = startVUID

    if args.nextvu is not None:
        logWarn('Tagging untagged Valid Usage statements starting at', args.nextvu)

    # If no files are specified, reflow the entire specification chapters folder
    if not args.files:
        folder_to_reflow = os.path.join(os.getcwd(), conventions.spec_reflow_path)
        reflowAllAdocFiles(folder_to_reflow, args)
    else:
        for file in args.files:
            reflowFile(file, args)

    # reflowFile() advances args.nextvu as tags are assigned. If it moved,
    # rewrite reflow_count.py (next to this script) so that the next run
    # starts from the first unused number.
    if args.nextvu is not None and args.nextvu != startVUID:
        reflow_count_file_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'reflow_count.py')
        try:
            # The 'with' closes the file even if a write fails part way
            with open(reflow_count_file_path, 'w', encoding='utf8') as reflow_count_file:
                reflow_count_file.write('# The value to start tagging VU statements at, unless overridden by -nextvu\n')
                reflow_count_file.write('startVUID = %d\n' % args.nextvu)
        except OSError as err:
            logWarn('Cannot open output count file reflow_count.py', ':', err)