# $Id$ # Author: David Goodger # Copyright: This module has been placed in the public domain. """ This is the ``docutils.parsers.rst.states`` module, the core of the reStructuredText parser. It defines the following: :Classes: - `RSTStateMachine`: reStructuredText parser's entry point. - `NestedStateMachine`: recursive StateMachine. - `RSTState`: reStructuredText State superclass. - `Inliner`: For parsing inline markup. - `Body`: Generic classifier of the first line of a block. - `SpecializedBody`: Superclass for compound element members. - `BulletList`: Second and subsequent bullet_list list_items - `DefinitionList`: Second+ definition_list_items. - `EnumeratedList`: Second+ enumerated_list list_items. - `FieldList`: Second+ fields. - `OptionList`: Second+ option_list_items. - `RFC2822List`: Second+ RFC2822-style fields. - `ExtensionOptions`: Parses directive option fields. - `Explicit`: Second+ explicit markup constructs. - `SubstitutionDef`: For embedded directives in substitution definitions. - `Text`: Classifier of second line of a text block. - `SpecializedText`: Superclass for continuation lines of Text-variants. - `Definition`: Second line of potential definition_list_item. - `Line`: Second line of overlined section title or transition marker. - `Struct`: An auxiliary collection class. :Exception classes: - `MarkupError` - `ParserError` - `MarkupMismatch` :Functions: - `escape2null()`: Return a string, escape-backslashes converted to nulls. - `unescape()`: Return a string, nulls removed or restored to backslashes. :Attributes: - `state_classes`: set of State classes used with `RSTStateMachine`. Parser Overview =============== The reStructuredText parser is implemented as a recursive state machine, examining its input one line at a time. To understand how the parser works, please first become familiar with the `docutils.statemachine` module. In the description below, references are made to classes defined in this module; please see the individual classes for details. Parsing proceeds as follows: 1. The state machine examines each line of input, checking each of the transition patterns of the state `Body`, in order, looking for a match. The implicit transitions (blank lines and indentation) are checked before any others. The 'text' transition is a catch-all (matches anything). 2. The method associated with the matched transition pattern is called. A. Some transition methods are self-contained, appending elements to the document tree (`Body.doctest` parses a doctest block). The parser's current line index is advanced to the end of the element, and parsing continues with step 1. B. Other transition methods trigger the creation of a nested state machine, whose job is to parse a compound construct ('indent' does a block quote, 'bullet' does a bullet list, 'overline' does a section [first checking for a valid section header], etc.). - In the case of lists and explicit markup, a one-off state machine is created and run to parse contents of the first item. - A new state machine is created and its initial state is set to the appropriate specialized state (`BulletList` in the case of the 'bullet' transition; see `SpecializedBody` for more detail). This state machine is run to parse the compound element (or series of explicit markup elements), and returns as soon as a non-member element is encountered. For example, the `BulletList` state machine ends as soon as it encounters an element which is not a list item of that bullet list. The optional omission of inter-element blank lines is enabled by this nested state machine. - The current line index is advanced to the end of the elements parsed, and parsing continues with step 1. C. The result of the 'text' transition depends on the next line of text. The current state is changed to `Text`, under which the second line is examined. If the second line is: - Indented: The element is a definition list item, and parsing proceeds similarly to step 2.B, using the `DefinitionList` state. - A line of uniform punctuation characters: The element is a section header; again, parsing proceeds as in step 2.B, and `Body` is still used. - Anything else: The element is a paragraph, which is examined for inline markup and appended to the parent element. Processing continues with step 1. """ __docformat__ = 'reStructuredText' import sys import re from types import FunctionType, MethodType from docutils import nodes, statemachine, utils from docutils import ApplicationError, DataError from docutils.statemachine import StateMachineWS, StateWS from docutils.nodes import fully_normalize_name as normalize_name from docutils.nodes import whitespace_normalize_name import docutils.parsers.rst from docutils.parsers.rst import directives, languages, tableparser, roles from docutils.parsers.rst.languages import en as _fallback_language_module from docutils.utils import escape2null, unescape, column_width from docutils.utils import punctuation_chars, roman, urischemes class MarkupError(DataError): pass class UnknownInterpretedRoleError(DataError): pass class InterpretedRoleNotImplementedError(DataError): pass class ParserError(ApplicationError): pass class MarkupMismatch(Exception): pass class Struct: """Stores data attributes for dotted-attribute access.""" def __init__(self, **keywordargs): self.__dict__.update(keywordargs) class RSTStateMachine(StateMachineWS): """ reStructuredText's master StateMachine. The entry point to reStructuredText parsing is the `run()` method. """ def run(self, input_lines, document, input_offset=0, match_titles=True, inliner=None): """ Parse `input_lines` and modify the `document` node in place. Extend `StateMachineWS.run()`: set up parse-global data and run the StateMachine. """ self.language = languages.get_language( document.settings.language_code) self.match_titles = match_titles if inliner is None: inliner = Inliner() inliner.init_customizations(document.settings) self.memo = Struct(document=document, reporter=document.reporter, language=self.language, title_styles=[], section_level=0, section_bubble_up_kludge=False, inliner=inliner) self.document = document self.attach_observer(document.note_source) self.reporter = self.memo.reporter self.node = document results = StateMachineWS.run(self, input_lines, input_offset, input_source=document['source']) assert results == [], 'RSTStateMachine.run() results should be empty!' self.node = self.memo = None # remove unneeded references class NestedStateMachine(StateMachineWS): """ StateMachine run from within other StateMachine runs, to parse nested document structures. """ def run(self, input_lines, input_offset, memo, node, match_titles=True): """ Parse `input_lines` and populate a `docutils.nodes.document` instance. Extend `StateMachineWS.run()`: set up document-wide data. """ self.match_titles = match_titles self.memo = memo self.document = memo.document self.attach_observer(self.document.note_source) self.reporter = memo.reporter self.language = memo.language self.node = node results = StateMachineWS.run(self, input_lines, input_offset) assert results == [], ('NestedStateMachine.run() results should be ' 'empty!') return results class RSTState(StateWS): """ reStructuredText State superclass. Contains methods used by all State subclasses. """ nested_sm = NestedStateMachine nested_sm_cache = [] def __init__(self, state_machine, debug=False): self.nested_sm_kwargs = {'state_classes': state_classes, 'initial_state': 'Body'} StateWS.__init__(self, state_machine, debug) def runtime_init(self): StateWS.runtime_init(self) memo = self.state_machine.memo self.memo = memo self.reporter = memo.reporter self.inliner = memo.inliner self.document = memo.document self.parent = self.state_machine.node # enable the reporter to determine source and source-line if not hasattr(self.reporter, 'get_source_and_line'): self.reporter.get_source_and_line = self.state_machine.get_source_and_line # print "adding get_source_and_line to reporter", self.state_machine.input_offset def goto_line(self, abs_line_offset): """ Jump to input line `abs_line_offset`, ignoring jumps past the end. """ try: self.state_machine.goto_line(abs_line_offset) except EOFError: pass def no_match(self, context, transitions): """ Override `StateWS.no_match` to generate a system message. This code should never be run. """ self.reporter.severe( 'Internal error: no transition pattern match. State: "%s"; ' 'transitions: %s; context: %s; current line: %r.' % (self.__class__.__name__, transitions, context, self.state_machine.line)) return context, None, [] def bof(self, context): """Called at beginning of file.""" return [], [] def nested_parse(self, block, input_offset, node, match_titles=False, state_machine_class=None, state_machine_kwargs=None): """ Create a new StateMachine rooted at `node` and run it over the input `block`. """ use_default = 0 if state_machine_class is None: state_machine_class = self.nested_sm use_default += 1 if state_machine_kwargs is None: state_machine_kwargs = self.nested_sm_kwargs use_default += 1 block_length = len(block) state_machine = None if use_default == 2: try: state_machine = self.nested_sm_cache.pop() except IndexError: pass if not state_machine: state_machine = state_machine_class(debug=self.debug, **state_machine_kwargs) state_machine.run(block, input_offset, memo=self.memo, node=node, match_titles=match_titles) if use_default == 2: self.nested_sm_cache.append(state_machine) else: state_machine.unlink() new_offset = state_machine.abs_line_offset() # No `block.parent` implies disconnected -- lines aren't in sync: if block.parent and (len(block) - block_length) != 0: # Adjustment for block if modified in nested parse: self.state_machine.next_line(len(block) - block_length) return new_offset def nested_list_parse(self, block, input_offset, node, initial_state, blank_finish, blank_finish_state=None, extra_settings={}, match_titles=False, state_machine_class=None, state_machine_kwargs=None): """ Create a new StateMachine rooted at `node` and run it over the input `block`. Also keep track of optional intermediate blank lines and the required final one. """ if state_machine_class is None: state_machine_class = self.nested_sm if state_machine_kwargs is None: state_machine_kwargs = self.nested_sm_kwargs.copy() state_machine_kwargs['initial_state'] = initial_state state_machine = state_machine_class(debug=self.debug, **state_machine_kwargs) if blank_finish_state is None: blank_finish_state = initial_state state_machine.states[blank_finish_state].blank_finish = blank_finish for key, value in extra_settings.items(): setattr(state_machine.states[initial_state], key, value) state_machine.run(block, input_offset, memo=self.memo, node=node, match_titles=match_titles) blank_finish = state_machine.states[blank_finish_state].blank_finish state_machine.unlink() return state_machine.abs_line_offset(), blank_finish def section(self, title, source, style, lineno, messages): """Check for a valid subsection and create one if it checks out.""" if self.check_subsection(source, style, lineno): self.new_subsection(title, lineno, messages) def check_subsection(self, source, style, lineno): """ Check for a valid subsection header. Return 1 (true) or None (false). When a new section is reached that isn't a subsection of the current section, back up the line count (use ``previous_line(-x)``), then ``raise EOFError``. The current StateMachine will finish, then the calling StateMachine can re-examine the title. This will work its way back up the calling chain until the correct section level isreached. @@@ Alternative: Evaluate the title, store the title info & level, and back up the chain until that level is reached. Store in memo? Or return in results? :Exception: `EOFError` when a sibling or supersection encountered. """ memo = self.memo title_styles = memo.title_styles mylevel = memo.section_level try: # check for existing title style level = title_styles.index(style) + 1 except ValueError: # new title style if len(title_styles) == memo.section_level: # new subsection title_styles.append(style) return 1 else: # not at lowest level self.parent += self.title_inconsistent(source, lineno) return None if level <= mylevel: # sibling or supersection memo.section_level = level # bubble up to parent section if len(style) == 2: memo.section_bubble_up_kludge = True # back up 2 lines for underline title, 3 for overline title self.state_machine.previous_line(len(style) + 1) raise EOFError # let parent section re-evaluate if level == mylevel + 1: # immediate subsection return 1 else: # invalid subsection self.parent += self.title_inconsistent(source, lineno) return None def title_inconsistent(self, sourcetext, lineno): error = self.reporter.severe( 'Title level inconsistent:', nodes.literal_block('', sourcetext), line=lineno) return error def new_subsection(self, title, lineno, messages): """Append new subsection to document tree. On return, check level.""" memo = self.memo mylevel = memo.section_level memo.section_level += 1 section_node = nodes.section() self.parent += section_node textnodes, title_messages = self.inline_text(title, lineno) titlenode = nodes.title(title, '', *textnodes) name = normalize_name(titlenode.astext()) section_node['names'].append(name) section_node += titlenode section_node += messages section_node += title_messages self.document.note_implicit_target(section_node, section_node) offset = self.state_machine.line_offset + 1 absoffset = self.state_machine.abs_line_offset() + 1 newabsoffset = self.nested_parse( self.state_machine.input_lines[offset:], input_offset=absoffset, node=section_node, match_titles=True) self.goto_line(newabsoffset) if memo.section_level <= mylevel: # can't handle next section? raise EOFError # bubble up to supersection # reset section_level; next pass will detect it properly memo.section_level = mylevel def paragraph(self, lines, lineno): """ Return a list (paragraph & messages) & a boolean: literal_block next? """ data = '\n'.join(lines).rstrip() if re.search(r'(?%(or_group)s)%(suffix)s' % locals() if compile: return re.compile(regexp, re.UNICODE) else: return regexp class Inliner: """ Parse inline markup; call the `parse()` method. """ def __init__(self): self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),] """List of (pattern, bound method) tuples, used by `self.implicit_inline`.""" def init_customizations(self, settings): """Setting-based customizations; run when parsing begins.""" if settings.pep_references: self.implicit_dispatch.append((self.patterns.pep, self.pep_reference)) if settings.rfc_references: self.implicit_dispatch.append((self.patterns.rfc, self.rfc_reference)) def parse(self, text, lineno, memo, parent): # Needs to be refactored for nested inline markup. # Add nested_parse() method? """ Return 2 lists: nodes (text and inline elements), and system_messages. Using `self.patterns.initial`, a pattern which matches start-strings (emphasis, strong, interpreted, phrase reference, literal, substitution reference, and inline target) and complete constructs (simple reference, footnote reference), search for a candidate. When one is found, check for validity (e.g., not a quoted '*' character). If valid, search for the corresponding end string if applicable, and check it for validity. If not found or invalid, generate a warning and ignore the start-string. Implicit inline markup (e.g. standalone URIs) is found last. """ self.reporter = memo.reporter self.document = memo.document self.language = memo.language self.parent = parent pattern_search = self.patterns.initial.search dispatch = self.dispatch remaining = escape2null(text) processed = [] unprocessed = [] messages = [] while remaining: match = pattern_search(remaining) if match: groups = match.groupdict() method = dispatch[groups['start'] or groups['backquote'] or groups['refend'] or groups['fnend']] before, inlines, remaining, sysmessages = method(self, match, lineno) unprocessed.append(before) messages += sysmessages if inlines: processed += self.implicit_inline(''.join(unprocessed), lineno) processed += inlines unprocessed = [] else: break remaining = ''.join(unprocessed) + remaining if remaining: processed += self.implicit_inline(remaining, lineno) return processed, messages # Inline object recognition # ------------------------- # lookahead and look-behind expressions for inline markup rules start_string_prefix = (u'(^|(?<=\\s|[%s%s]))' % (punctuation_chars.openers, punctuation_chars.delimiters)) end_string_suffix = (u'($|(?=\\s|[\x00%s%s%s]))' % (punctuation_chars.closing_delimiters, punctuation_chars.delimiters, punctuation_chars.closers)) # print start_string_prefix.encode('utf8') # TODO: support non-ASCII whitespace in the following 4 patterns? non_whitespace_before = r'(?]""" # Last URI character; same as uric but no punctuation: urilast = r"""[_~*/=+a-zA-Z0-9]""" # End of a URI (either 'urilast' or 'uric followed by a # uri_end_delim'): uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals() emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]""" email_pattern = r""" %(emailc)s+(?:\.%(emailc)s+)* # name (?%s)(?P__?)' % simplename, ('footnotelabel', r'\[', r'(?P\]_)', [r'[0-9]+', # manually numbered r'\#(%s)?' % simplename, # auto-numbered (w/ label?) r'\*', # auto-symbol r'(?P%s)' % simplename] # citation reference ) ] ), ('backquote', # interpreted text or phrase reference '(?P(:%s:)?)' % simplename, # optional role non_whitespace_after, ['`(?!`)'] # but not literal ) ] ) patterns = Struct( initial=build_regexp(parts), emphasis=re.compile(non_whitespace_escape_before + r'(\*)' + end_string_suffix, re.UNICODE), strong=re.compile(non_whitespace_escape_before + r'(\*\*)' + end_string_suffix, re.UNICODE), interpreted_or_phrase_ref=re.compile( r""" %(non_unescaped_whitespace_escape_before)s ( ` (?P (?P:%(simplename)s:)? (?P__?)? ) ) %(end_string_suffix)s """ % locals(), re.VERBOSE | re.UNICODE), embedded_link=re.compile( r""" ( (?:[ \n]+|^) # spaces or beginning of line/string < # open bracket %(non_whitespace_after)s ([^<>\x00]+(\x00_)?) # anything but angle brackets & nulls # except escaped trailing low line %(non_whitespace_before)s > # close bracket w/o whitespace before ) $ # end of string """ % locals(), re.VERBOSE | re.UNICODE), literal=re.compile(non_whitespace_before + '(``)' + end_string_suffix), target=re.compile(non_whitespace_escape_before + r'(`)' + end_string_suffix), substitution_ref=re.compile(non_whitespace_escape_before + r'(\|_{0,2})' + end_string_suffix), email=re.compile(email_pattern % locals() + '$', re.VERBOSE | re.UNICODE), uri=re.compile( (r""" %(start_string_prefix)s (?P (?P # absolute URI (?P # scheme (http, ftp, mailto) [a-zA-Z][a-zA-Z0-9.+-]* ) : ( ( # either: (//?)? # hierarchical URI %(uric)s* # URI characters %(uri_end)s # final URI char ) ( # optional query \?%(uric)s* %(uri_end)s )? ( # optional fragment \#%(uric)s* %(uri_end)s )? ) ) | # *OR* (?P # email address """ + email_pattern + r""" ) ) %(end_string_suffix)s """) % locals(), re.VERBOSE | re.UNICODE), pep=re.compile( r""" %(start_string_prefix)s ( (pep-(?P\d+)(.txt)?) # reference to source file | (PEP\s+(?P\d+)) # reference by name ) %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE), rfc=re.compile( r""" %(start_string_prefix)s (RFC(-|\s+)?(?P\d+)) %(end_string_suffix)s""" % locals(), re.VERBOSE | re.UNICODE)) def quoted_start(self, match): """Test if inline markup start-string is 'quoted'. 'Quoted' in this context means the start-string is enclosed in a pair of matching opening/closing delimiters (not necessarily quotes) or at the end of the match. """ string = match.string start = match.start() if start == 0: # start-string at beginning of text return False prestart = string[start - 1] try: poststart = string[match.end()] except IndexError: # start-string at end of text return True # not "quoted" but no markup start-string either return punctuation_chars.match_chars(prestart, poststart) def inline_obj(self, match, lineno, end_pattern, nodeclass, restore_backslashes=False): string = match.string matchstart = match.start('start') matchend = match.end('start') if self.quoted_start(match): return (string[:matchend], [], string[matchend:], [], '') endmatch = end_pattern.search(string[matchend:]) if endmatch and endmatch.start(1): # 1 or more chars text = unescape(endmatch.string[:endmatch.start(1)], restore_backslashes) textend = matchend + endmatch.end(1) rawsource = unescape(string[matchstart:textend], 1) return (string[:matchstart], [nodeclass(rawsource, text)], string[textend:], [], endmatch.group(1)) msg = self.reporter.warning( 'Inline %s start-string without end-string.' % nodeclass.__name__, line=lineno) text = unescape(string[matchstart:matchend], 1) rawsource = unescape(string[matchstart:matchend], 1) prb = self.problematic(text, rawsource, msg) return string[:matchstart], [prb], string[matchend:], [msg], '' def problematic(self, text, rawsource, message): msgid = self.document.set_id(message, self.parent) problematic = nodes.problematic(rawsource, text, refid=msgid) prbid = self.document.set_id(problematic) message.add_backref(prbid) return problematic def emphasis(self, match, lineno): before, inlines, remaining, sysmessages, endstring = self.inline_obj( match, lineno, self.patterns.emphasis, nodes.emphasis) return before, inlines, remaining, sysmessages def strong(self, match, lineno): before, inlines, remaining, sysmessages, endstring = self.inline_obj( match, lineno, self.patterns.strong, nodes.strong) return before, inlines, remaining, sysmessages def interpreted_or_phrase_ref(self, match, lineno): end_pattern = self.patterns.interpreted_or_phrase_ref string = match.string matchstart = match.start('backquote') matchend = match.end('backquote') rolestart = match.start('role') role = match.group('role') position = '' if role: role = role[1:-1] position = 'prefix' elif self.quoted_start(match): return (string[:matchend], [], string[matchend:], []) endmatch = end_pattern.search(string[matchend:]) if endmatch and endmatch.start(1): # 1 or more chars textend = matchend + endmatch.end() if endmatch.group('role'): if role: msg = self.reporter.warning( 'Multiple roles in interpreted text (both ' 'prefix and suffix present; only one allowed).', line=lineno) text = unescape(string[rolestart:textend], 1) prb = self.problematic(text, text, msg) return string[:rolestart], [prb], string[textend:], [msg] role = endmatch.group('suffix')[1:-1] position = 'suffix' escaped = endmatch.string[:endmatch.start(1)] rawsource = unescape(string[matchstart:textend], 1) if rawsource[-1:] == '_': if role: msg = self.reporter.warning( 'Mismatch: both interpreted text role %s and ' 'reference suffix.' % position, line=lineno) text = unescape(string[rolestart:textend], 1) prb = self.problematic(text, text, msg) return string[:rolestart], [prb], string[textend:], [msg] return self.phrase_ref(string[:matchstart], string[textend:], rawsource, escaped, unescape(escaped)) else: rawsource = unescape(string[rolestart:textend], 1) nodelist, messages = self.interpreted(rawsource, escaped, role, lineno) return (string[:rolestart], nodelist, string[textend:], messages) msg = self.reporter.warning( 'Inline interpreted text or phrase reference start-string ' 'without end-string.', line=lineno) text = unescape(string[matchstart:matchend], 1) prb = self.problematic(text, text, msg) return string[:matchstart], [prb], string[matchend:], [msg] def phrase_ref(self, before, after, rawsource, escaped, text): match = self.patterns.embedded_link.search(escaped) if match: # embedded or text = unescape(escaped[:match.start(0)]) aliastext = unescape(match.group(2), restore_backslashes=True) if aliastext.endswith('_') and not (aliastext.endswith(r'\_') or self.patterns.uri.match(aliastext)): aliastype = 'name' alias = normalize_name(aliastext[:-1]) target = nodes.target(match.group(1), refname=alias) target.indirect_reference_name = aliastext[:-1] else: aliastype = 'uri' alias = ''.join(aliastext.split()) alias = self.adjust_uri(alias) if alias.endswith(r'\_'): alias = alias[:-2] + '_' target = nodes.target(match.group(1), refuri=alias) target.referenced = 1 if not aliastext: raise ApplicationError('problem with embedded link: %r' % aliastext) if not text: text = alias else: target = None refname = normalize_name(text) reference = nodes.reference(rawsource, text, name=whitespace_normalize_name(text)) node_list = [reference] if rawsource[-2:] == '__': if target and (aliastype == 'name'): reference['refname'] = alias self.document.note_refname(reference) # self.document.note_indirect_target(target) # required? elif target and (aliastype == 'uri'): reference['refuri'] = alias else: reference['anonymous'] = 1 else: if target: target['names'].append(refname) if aliastype == 'name': reference['refname'] = alias self.document.note_indirect_target(target) self.document.note_refname(reference) else: reference['refuri'] = alias self.document.note_explicit_target(target, self.parent) # target.note_referenced_by(name=refname) node_list.append(target) else: reference['refname'] = refname self.document.note_refname(reference) return before, node_list, after, [] def adjust_uri(self, uri): match = self.patterns.email.match(uri) if match: return 'mailto:' + uri else: return uri def interpreted(self, rawsource, text, role, lineno): role_fn, messages = roles.role(role, self.language, lineno, self.reporter) if role_fn: nodes, messages2 = role_fn(role, rawsource, text, lineno, self) return nodes, messages + messages2 else: msg = self.reporter.error( 'Unknown interpreted text role "%s".' % role, line=lineno) return ([self.problematic(rawsource, rawsource, msg)], messages + [msg]) def literal(self, match, lineno): before, inlines, remaining, sysmessages, endstring = self.inline_obj( match, lineno, self.patterns.literal, nodes.literal, restore_backslashes=True) return before, inlines, remaining, sysmessages def inline_internal_target(self, match, lineno): before, inlines, remaining, sysmessages, endstring = self.inline_obj( match, lineno, self.patterns.target, nodes.target) if inlines and isinstance(inlines[0], nodes.target): assert len(inlines) == 1 target = inlines[0] name = normalize_name(target.astext()) target['names'].append(name) self.document.note_explicit_target(target, self.parent) return before, inlines, remaining, sysmessages def substitution_reference(self, match, lineno): before, inlines, remaining, sysmessages, endstring = self.inline_obj( match, lineno, self.patterns.substitution_ref, nodes.substitution_reference) if len(inlines) == 1: subref_node = inlines[0] if isinstance(subref_node, nodes.substitution_reference): subref_text = subref_node.astext() self.document.note_substitution_ref(subref_node, subref_text) if endstring[-1:] == '_': reference_node = nodes.reference( '|%s%s' % (subref_text, endstring), '') if endstring[-2:] == '__': reference_node['anonymous'] = 1 else: reference_node['refname'] = normalize_name(subref_text) self.document.note_refname(reference_node) reference_node += subref_node inlines = [reference_node] return before, inlines, remaining, sysmessages def footnote_reference(self, match, lineno): """ Handles `nodes.footnote_reference` and `nodes.citation_reference` elements. """ label = match.group('footnotelabel') refname = normalize_name(label) string = match.string before = string[:match.start('whole')] remaining = string[match.end('whole'):] if match.group('citationlabel'): refnode = nodes.citation_reference('[%s]_' % label, refname=refname) refnode += nodes.Text(label) self.document.note_citation_ref(refnode) else: refnode = nodes.footnote_reference('[%s]_' % label) if refname[0] == '#': refname = refname[1:] refnode['auto'] = 1 self.document.note_autofootnote_ref(refnode) elif refname == '*': refname = '' refnode['auto'] = '*' self.document.note_symbol_footnote_ref( refnode) else: refnode += nodes.Text(label) if refname: refnode['refname'] = refname self.document.note_footnote_ref(refnode) if utils.get_trim_footnote_ref_space(self.document.settings): before = before.rstrip() return (before, [refnode], remaining, []) def reference(self, match, lineno, anonymous=False): referencename = match.group('refname') refname = normalize_name(referencename) referencenode = nodes.reference( referencename + match.group('refend'), referencename, name=whitespace_normalize_name(referencename)) if anonymous: referencenode['anonymous'] = 1 else: referencenode['refname'] = refname self.document.note_refname(referencenode) string = match.string matchstart = match.start('whole') matchend = match.end('whole') return (string[:matchstart], [referencenode], string[matchend:], []) def anonymous_reference(self, match, lineno): return self.reference(match, lineno, anonymous=1) def standalone_uri(self, match, lineno): if (not match.group('scheme') or match.group('scheme').lower() in urischemes.schemes): if match.group('email'): addscheme = 'mailto:' else: addscheme = '' text = match.group('whole') unescaped = unescape(text, 0) return [nodes.reference(unescape(text, 1), unescaped, refuri=addscheme + unescaped)] else: # not a valid scheme raise MarkupMismatch def pep_reference(self, match, lineno): text = match.group(0) if text.startswith('pep-'): pepnum = int(match.group('pepnum1')) elif text.startswith('PEP'): pepnum = int(match.group('pepnum2')) else: raise MarkupMismatch ref = (self.document.settings.pep_base_url + self.document.settings.pep_file_url_template % pepnum) unescaped = unescape(text, 0) return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)] rfc_url = 'rfc%d.html' def rfc_reference(self, match, lineno): text = match.group(0) if text.startswith('RFC'): rfcnum = int(match.group('rfcnum')) ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum else: raise MarkupMismatch unescaped = unescape(text, 0) return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)] def implicit_inline(self, text, lineno): """ Check each of the patterns in `self.implicit_dispatch` for a match, and dispatch to the stored method for the pattern. Recursively check the text before and after the match. Return a list of `nodes.Text` and inline element nodes. """ if not text: return [] for pattern, method in self.implicit_dispatch: match = pattern.search(text) if match: try: # Must recurse on strings before *and* after the match; # there may be multiple patterns. return (self.implicit_inline(text[:match.start()], lineno) + method(match, lineno) + self.implicit_inline(text[match.end():], lineno)) except MarkupMismatch: pass return [nodes.Text(unescape(text), rawsource=unescape(text, 1))] dispatch = {'*': emphasis, '**': strong, '`': interpreted_or_phrase_ref, '``': literal, '_`': inline_internal_target, ']_': footnote_reference, '|': substitution_reference, '_': reference, '__': anonymous_reference} def _loweralpha_to_int(s, _zero=(ord('a')-1)): return ord(s) - _zero def _upperalpha_to_int(s, _zero=(ord('A')-1)): return ord(s) - _zero def _lowerroman_to_int(s): return roman.fromRoman(s.upper()) class Body(RSTState): """ Generic classifier of the first line of a block. """ double_width_pad_char = tableparser.TableParser.double_width_pad_char """Padding character for East Asian double-width text.""" enum = Struct() """Enumerated list parsing information.""" enum.formatinfo = { 'parens': Struct(prefix='(', suffix=')', start=1, end=-1), 'rparen': Struct(prefix='', suffix=')', start=0, end=-1), 'period': Struct(prefix='', suffix='.', start=0, end=-1)} enum.formats = enum.formatinfo.keys() enum.sequences = ['arabic', 'loweralpha', 'upperalpha', 'lowerroman', 'upperroman'] # ORDERED! enum.sequencepats = {'arabic': '[0-9]+', 'loweralpha': '[a-z]', 'upperalpha': '[A-Z]', 'lowerroman': '[ivxlcdm]+', 'upperroman': '[IVXLCDM]+',} enum.converters = {'arabic': int, 'loweralpha': _loweralpha_to_int, 'upperalpha': _upperalpha_to_int, 'lowerroman': _lowerroman_to_int, 'upperroman': roman.fromRoman} enum.sequenceregexps = {} for sequence in enum.sequences: enum.sequenceregexps[sequence] = re.compile( enum.sequencepats[sequence] + '$', re.UNICODE) grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$') """Matches the top (& bottom) of a full table).""" simple_table_top_pat = re.compile('=+( +=+)+ *$') """Matches the top of a simple table.""" simple_table_border_pat = re.compile('=+[ =]*$') """Matches the bottom & header bottom of a simple table.""" pats = {} """Fragments of patterns used by transitions.""" pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]' pats['alpha'] = '[a-zA-Z]' pats['alphanum'] = '[a-zA-Z0-9]' pats['alphanumplus'] = '[a-zA-Z0-9_-]' pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s' '|%(upperroman)s|#)' % enum.sequencepats) pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats # @@@ Loosen up the pattern? Allow Unicode? pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats for format in enum.formats: pats[format] = '(?P<%s>%s%s%s)' % ( format, re.escape(enum.formatinfo[format].prefix), pats['enum'], re.escape(enum.formatinfo[format].suffix)) patterns = { 'bullet': u'[-+*\u2022\u2023\u2043]( +|$)', 'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats, 'field_marker': r':(?![: ])([^:\\]|\\.)*(?>>( +|$)', 'line_block': r'\|( +|$)', 'grid_table_top': grid_table_top_pat, 'simple_table_top': simple_table_top_pat, 'explicit_markup': r'\.\.( +|$)', 'anonymous': r'__( +|$)', 'line': r'(%(nonalphanum7bit)s)\1* *$' % pats, 'text': r''} initial_transitions = ( 'bullet', 'enumerator', 'field_marker', 'option_marker', 'doctest', 'line_block', 'grid_table_top', 'simple_table_top', 'explicit_markup', 'anonymous', 'line', 'text') def indent(self, match, context, next_state): """Block quote.""" indented, indent, line_offset, blank_finish = \ self.state_machine.get_indented() elements = self.block_quote(indented, line_offset) self.parent += elements if not blank_finish: self.parent += self.unindent_warning('Block quote') return context, next_state, [] def block_quote(self, indented, line_offset): elements = [] while indented: (blockquote_lines, attribution_lines, attribution_offset, indented, new_line_offset) = self.split_attribution(indented, line_offset) blockquote = nodes.block_quote() self.nested_parse(blockquote_lines, line_offset, blockquote) elements.append(blockquote) if attribution_lines: attribution, messages = self.parse_attribution( attribution_lines, attribution_offset) blockquote += attribution elements += messages line_offset = new_line_offset while indented and not indented[0]: indented = indented[1:] line_offset += 1 return elements # U+2014 is an em-dash: attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])', re.UNICODE) def split_attribution(self, indented, line_offset): """ Check for a block quote attribution and split it off: * First line after a blank line must begin with a dash ("--", "---", em-dash; matches `self.attribution_pattern`). * Every line after that must have consistent indentation. * Attributions must be preceded by block quote content. Return a tuple of: (block quote content lines, content offset, attribution lines, attribution offset, remaining indented lines). """ blank = None nonblank_seen = False for i in range(len(indented)): line = indented[i].rstrip() if line: if nonblank_seen and blank == i - 1: # last line blank match = self.attribution_pattern.match(line) if match: attribution_end, indent = self.check_attribution( indented, i) if attribution_end: a_lines = indented[i:attribution_end] a_lines.trim_left(match.end(), end=1) a_lines.trim_left(indent, start=1) return (indented[:i], a_lines, i, indented[attribution_end:], line_offset + attribution_end) nonblank_seen = True else: blank = i else: return (indented, None, None, None, None) def check_attribution(self, indented, attribution_start): """ Check attribution shape. Return the index past the end of the attribution, and the indent. """ indent = None i = attribution_start + 1 for i in range(attribution_start + 1, len(indented)): line = indented[i].rstrip() if not line: break if indent is None: indent = len(line) - len(line.lstrip()) elif len(line) - len(line.lstrip()) != indent: return None, None # bad shape; not an attribution else: # return index of line after last attribution line: i += 1 return i, (indent or 0) def parse_attribution(self, indented, line_offset): text = '\n'.join(indented).rstrip() lineno = self.state_machine.abs_line_number() + line_offset textnodes, messages = self.inline_text(text, lineno) node = nodes.attribution(text, '', *textnodes) node.source, node.line = self.state_machine.get_source_and_line(lineno) return node, messages def bullet(self, match, context, next_state): """Bullet list item.""" bulletlist = nodes.bullet_list() self.parent += bulletlist bulletlist['bullet'] = match.string[0] i, blank_finish = self.list_item(match.end()) bulletlist += i offset = self.state_machine.line_offset + 1 # next line new_line_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=bulletlist, initial_state='BulletList', blank_finish=blank_finish) self.goto_line(new_line_offset) if not blank_finish: self.parent += self.unindent_warning('Bullet list') return [], next_state, [] def list_item(self, indent): if self.state_machine.line[indent:]: indented, line_offset, blank_finish = ( self.state_machine.get_known_indented(indent)) else: indented, indent, line_offset, blank_finish = ( self.state_machine.get_first_known_indented(indent)) listitem = nodes.list_item('\n'.join(indented)) if indented: self.nested_parse(indented, input_offset=line_offset, node=listitem) return listitem, blank_finish def enumerator(self, match, context, next_state): """Enumerated List Item""" format, sequence, text, ordinal = self.parse_enumerator(match) if not self.is_enumerated_list_item(ordinal, sequence, format): raise statemachine.TransitionCorrection('text') enumlist = nodes.enumerated_list() self.parent += enumlist if sequence == '#': enumlist['enumtype'] = 'arabic' else: enumlist['enumtype'] = sequence enumlist['prefix'] = self.enum.formatinfo[format].prefix enumlist['suffix'] = self.enum.formatinfo[format].suffix if ordinal != 1: enumlist['start'] = ordinal msg = self.reporter.info( 'Enumerated list start value not ordinal-1: "%s" (ordinal %s)' % (text, ordinal)) self.parent += msg listitem, blank_finish = self.list_item(match.end()) enumlist += listitem offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=enumlist, initial_state='EnumeratedList', blank_finish=blank_finish, extra_settings={'lastordinal': ordinal, 'format': format, 'auto': sequence == '#'}) self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning('Enumerated list') return [], next_state, [] def parse_enumerator(self, match, expected_sequence=None): """ Analyze an enumerator and return the results. :Return: - the enumerator format ('period', 'parens', or 'rparen'), - the sequence used ('arabic', 'loweralpha', 'upperroman', etc.), - the text of the enumerator, stripped of formatting, and - the ordinal value of the enumerator ('a' -> 1, 'ii' -> 2, etc.; ``None`` is returned for invalid enumerator text). The enumerator format has already been determined by the regular expression match. If `expected_sequence` is given, that sequence is tried first. If not, we check for Roman numeral 1. This way, single-character Roman numerals (which are also alphabetical) can be matched. If no sequence has been matched, all sequences are checked in order. """ groupdict = match.groupdict() sequence = '' for format in self.enum.formats: if groupdict[format]: # was this the format matched? break # yes; keep `format` else: # shouldn't happen raise ParserError('enumerator format not matched') text = groupdict[format][self.enum.formatinfo[format].start :self.enum.formatinfo[format].end] if text == '#': sequence = '#' elif expected_sequence: try: if self.enum.sequenceregexps[expected_sequence].match(text): sequence = expected_sequence except KeyError: # shouldn't happen raise ParserError('unknown enumerator sequence: %s' % sequence) elif text == 'i': sequence = 'lowerroman' elif text == 'I': sequence = 'upperroman' if not sequence: for sequence in self.enum.sequences: if self.enum.sequenceregexps[sequence].match(text): break else: # shouldn't happen raise ParserError('enumerator sequence not matched') if sequence == '#': ordinal = 1 else: try: ordinal = self.enum.converters[sequence](text) except roman.InvalidRomanNumeralError: ordinal = None return format, sequence, text, ordinal def is_enumerated_list_item(self, ordinal, sequence, format): """ Check validity based on the ordinal value and the second line. Return true if the ordinal is valid and the second line is blank, indented, or starts with the next enumerator or an auto-enumerator. """ if ordinal is None: return None try: next_line = self.state_machine.next_line() except EOFError: # end of input lines self.state_machine.previous_line() return 1 else: self.state_machine.previous_line() if not next_line[:1].strip(): # blank or indented return 1 result = self.make_enumerator(ordinal + 1, sequence, format) if result: next_enumerator, auto_enumerator = result try: if ( next_line.startswith(next_enumerator) or next_line.startswith(auto_enumerator) ): return 1 except TypeError: pass return None def make_enumerator(self, ordinal, sequence, format): """ Construct and return the next enumerated list item marker, and an auto-enumerator ("#" instead of the regular enumerator). Return ``None`` for invalid (out of range) ordinals. """ #" if sequence == '#': enumerator = '#' elif sequence == 'arabic': enumerator = str(ordinal) else: if sequence.endswith('alpha'): if ordinal > 26: return None enumerator = chr(ordinal + ord('a') - 1) elif sequence.endswith('roman'): try: enumerator = roman.toRoman(ordinal) except roman.RomanError: return None else: # shouldn't happen raise ParserError('unknown enumerator sequence: "%s"' % sequence) if sequence.startswith('lower'): enumerator = enumerator.lower() elif sequence.startswith('upper'): enumerator = enumerator.upper() else: # shouldn't happen raise ParserError('unknown enumerator sequence: "%s"' % sequence) formatinfo = self.enum.formatinfo[format] next_enumerator = (formatinfo.prefix + enumerator + formatinfo.suffix + ' ') auto_enumerator = formatinfo.prefix + '#' + formatinfo.suffix + ' ' return next_enumerator, auto_enumerator def field_marker(self, match, context, next_state): """Field list item.""" field_list = nodes.field_list() self.parent += field_list field, blank_finish = self.field(match) field_list += field offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=field_list, initial_state='FieldList', blank_finish=blank_finish) self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning('Field list') return [], next_state, [] def field(self, match): name = self.parse_field_marker(match) src, srcline = self.state_machine.get_source_and_line() lineno = self.state_machine.abs_line_number() indented, indent, line_offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) field_node = nodes.field() field_node.source = src field_node.line = srcline name_nodes, name_messages = self.inline_text(name, lineno) field_node += nodes.field_name(name, '', *name_nodes) field_body = nodes.field_body('\n'.join(indented), *name_messages) field_node += field_body if indented: self.parse_field_body(indented, line_offset, field_body) return field_node, blank_finish def parse_field_marker(self, match): """Extract & return field name from a field marker match.""" field = match.group()[1:] # strip off leading ':' field = field[:field.rfind(':')] # strip off trailing ':' etc. return field def parse_field_body(self, indented, offset, node): self.nested_parse(indented, input_offset=offset, node=node) def option_marker(self, match, context, next_state): """Option list item.""" optionlist = nodes.option_list() try: listitem, blank_finish = self.option_list_item(match) except MarkupError, error: # This shouldn't happen; pattern won't match. msg = self.reporter.error(u'Invalid option list marker: %s' % error) self.parent += msg indented, indent, line_offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) elements = self.block_quote(indented, line_offset) self.parent += elements if not blank_finish: self.parent += self.unindent_warning('Option list') return [], next_state, [] self.parent += optionlist optionlist += listitem offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=optionlist, initial_state='OptionList', blank_finish=blank_finish) self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning('Option list') return [], next_state, [] def option_list_item(self, match): offset = self.state_machine.abs_line_offset() options = self.parse_option_marker(match) indented, indent, line_offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) if not indented: # not an option list item self.goto_line(offset) raise statemachine.TransitionCorrection('text') option_group = nodes.option_group('', *options) description = nodes.description('\n'.join(indented)) option_list_item = nodes.option_list_item('', option_group, description) if indented: self.nested_parse(indented, input_offset=line_offset, node=description) return option_list_item, blank_finish def parse_option_marker(self, match): """ Return a list of `node.option` and `node.option_argument` objects, parsed from an option marker match. :Exception: `MarkupError` for invalid option markers. """ optlist = [] optionstrings = match.group().rstrip().split(', ') for optionstring in optionstrings: tokens = optionstring.split() delimiter = ' ' firstopt = tokens[0].split('=', 1) if len(firstopt) > 1: # "--opt=value" form tokens[:1] = firstopt delimiter = '=' elif (len(tokens[0]) > 2 and ((tokens[0].startswith('-') and not tokens[0].startswith('--')) or tokens[0].startswith('+'))): # "-ovalue" form tokens[:1] = [tokens[0][:2], tokens[0][2:]] delimiter = '' if len(tokens) > 1 and (tokens[1].startswith('<') and tokens[-1].endswith('>')): # "-o " form; join all values into one token tokens[1:] = [' '.join(tokens[1:])] if 0 < len(tokens) <= 2: option = nodes.option(optionstring) option += nodes.option_string(tokens[0], tokens[0]) if len(tokens) > 1: option += nodes.option_argument(tokens[1], tokens[1], delimiter=delimiter) optlist.append(option) else: raise MarkupError( 'wrong number of option tokens (=%s), should be 1 or 2: ' '"%s"' % (len(tokens), optionstring)) return optlist def doctest(self, match, context, next_state): data = '\n'.join(self.state_machine.get_text_block()) self.parent += nodes.doctest_block(data, data) return [], next_state, [] def line_block(self, match, context, next_state): """First line of a line block.""" block = nodes.line_block() self.parent += block lineno = self.state_machine.abs_line_number() line, messages, blank_finish = self.line_block_line(match, lineno) block += line self.parent += messages if not blank_finish: offset = self.state_machine.line_offset + 1 # next line new_line_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=block, initial_state='LineBlock', blank_finish=0) self.goto_line(new_line_offset) if not blank_finish: self.parent += self.reporter.warning( 'Line block ends without a blank line.', line=lineno+1) if len(block): if block[0].indent is None: block[0].indent = 0 self.nest_line_block_lines(block) return [], next_state, [] def line_block_line(self, match, lineno): """Return one line element of a line_block.""" indented, indent, line_offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end(), until_blank=True) text = u'\n'.join(indented) text_nodes, messages = self.inline_text(text, lineno) line = nodes.line(text, '', *text_nodes) if match.string.rstrip() != '|': # not empty line.indent = len(match.group(1)) - 1 return line, messages, blank_finish def nest_line_block_lines(self, block): for index in range(1, len(block)): if getattr(block[index], 'indent', None) is None: block[index].indent = block[index - 1].indent self.nest_line_block_segment(block) def nest_line_block_segment(self, block): indents = [item.indent for item in block] least = min(indents) new_items = [] new_block = nodes.line_block() for item in block: if item.indent > least: new_block.append(item) else: if len(new_block): self.nest_line_block_segment(new_block) new_items.append(new_block) new_block = nodes.line_block() new_items.append(item) if len(new_block): self.nest_line_block_segment(new_block) new_items.append(new_block) block[:] = new_items def grid_table_top(self, match, context, next_state): """Top border of a full table.""" return self.table_top(match, context, next_state, self.isolate_grid_table, tableparser.GridTableParser) def simple_table_top(self, match, context, next_state): """Top border of a simple table.""" return self.table_top(match, context, next_state, self.isolate_simple_table, tableparser.SimpleTableParser) def table_top(self, match, context, next_state, isolate_function, parser_class): """Top border of a generic table.""" nodelist, blank_finish = self.table(isolate_function, parser_class) self.parent += nodelist if not blank_finish: msg = self.reporter.warning( 'Blank line required after table.', line=self.state_machine.abs_line_number()+1) self.parent += msg return [], next_state, [] def table(self, isolate_function, parser_class): """Parse a table.""" block, messages, blank_finish = isolate_function() if block: try: parser = parser_class() tabledata = parser.parse(block) tableline = (self.state_machine.abs_line_number() - len(block) + 1) table = self.build_table(tabledata, tableline) nodelist = [table] + messages except tableparser.TableMarkupError, err: nodelist = self.malformed_table(block, ' '.join(err.args), offset=err.offset) + messages else: nodelist = messages return nodelist, blank_finish def isolate_grid_table(self): messages = [] blank_finish = 1 try: block = self.state_machine.get_text_block(flush_left=True) except statemachine.UnexpectedIndentationError, err: block, src, srcline = err.args messages.append(self.reporter.error('Unexpected indentation.', source=src, line=srcline)) blank_finish = 0 block.disconnect() # for East Asian chars: block.pad_double_width(self.double_width_pad_char) width = len(block[0].strip()) for i in range(len(block)): block[i] = block[i].strip() if block[i][0] not in '+|': # check left edge blank_finish = 0 self.state_machine.previous_line(len(block) - i) del block[i:] break if not self.grid_table_top_pat.match(block[-1]): # find bottom blank_finish = 0 # from second-last to third line of table: for i in range(len(block) - 2, 1, -1): if self.grid_table_top_pat.match(block[i]): self.state_machine.previous_line(len(block) - i + 1) del block[i+1:] break else: messages.extend(self.malformed_table(block)) return [], messages, blank_finish for i in range(len(block)): # check right edge if len(block[i]) != width or block[i][-1] not in '+|': messages.extend(self.malformed_table(block)) return [], messages, blank_finish return block, messages, blank_finish def isolate_simple_table(self): start = self.state_machine.line_offset lines = self.state_machine.input_lines limit = len(lines) - 1 toplen = len(lines[start].strip()) pattern_match = self.simple_table_border_pat.match found = 0 found_at = None i = start + 1 while i <= limit: line = lines[i] match = pattern_match(line) if match: if len(line.strip()) != toplen: self.state_machine.next_line(i - start) messages = self.malformed_table( lines[start:i+1], 'Bottom/header table border does ' 'not match top border.') return [], messages, i == limit or not lines[i+1].strip() found += 1 found_at = i if found == 2 or i == limit or not lines[i+1].strip(): end = i break i += 1 else: # reached end of input_lines if found: extra = ' or no blank line after table bottom' self.state_machine.next_line(found_at - start) block = lines[start:found_at+1] else: extra = '' self.state_machine.next_line(i - start - 1) block = lines[start:] messages = self.malformed_table( block, 'No bottom table border found%s.' % extra) return [], messages, not extra self.state_machine.next_line(end - start) block = lines[start:end+1] # for East Asian chars: block.pad_double_width(self.double_width_pad_char) return block, [], end == limit or not lines[end+1].strip() def malformed_table(self, block, detail='', offset=0): block.replace(self.double_width_pad_char, '') data = '\n'.join(block) message = 'Malformed table.' startline = self.state_machine.abs_line_number() - len(block) + 1 if detail: message += '\n' + detail error = self.reporter.error(message, nodes.literal_block(data, data), line=startline+offset) return [error] def build_table(self, tabledata, tableline, stub_columns=0): colwidths, headrows, bodyrows = tabledata table = nodes.table() tgroup = nodes.tgroup(cols=len(colwidths)) table += tgroup for colwidth in colwidths: colspec = nodes.colspec(colwidth=colwidth) if stub_columns: colspec.attributes['stub'] = 1 stub_columns -= 1 tgroup += colspec if headrows: thead = nodes.thead() tgroup += thead for row in headrows: thead += self.build_table_row(row, tableline) tbody = nodes.tbody() tgroup += tbody for row in bodyrows: tbody += self.build_table_row(row, tableline) return table def build_table_row(self, rowdata, tableline): row = nodes.row() for cell in rowdata: if cell is None: continue morerows, morecols, offset, cellblock = cell attributes = {} if morerows: attributes['morerows'] = morerows if morecols: attributes['morecols'] = morecols entry = nodes.entry(**attributes) row += entry if ''.join(cellblock): self.nested_parse(cellblock, input_offset=tableline+offset, node=entry) return row explicit = Struct() """Patterns and constants used for explicit markup recognition.""" explicit.patterns = Struct( target=re.compile(r""" ( _ # anonymous target | # *OR* (?!_) # no underscore at the beginning (?P`?) # optional open quote (?![ `]) # first char. not space or # backquote (?P # reference name .+? ) %(non_whitespace_escape_before)s (?P=quote) # close quote if open quote used ) (?%(simplename)s)_ | # *OR* ` # open backquote (?![ ]) # not space (?P.+?) # hyperlink phrase %(non_whitespace_escape_before)s `_ # close backquote, # reference mark ) $ # end of string """ % vars(Inliner), re.VERBOSE | re.UNICODE), substitution=re.compile(r""" ( (?![ ]) # first char. not space (?P.+?) # substitution text %(non_whitespace_escape_before)s \| # close delimiter ) ([ ]+|$) # followed by whitespace """ % vars(Inliner), re.VERBOSE | re.UNICODE),) def footnote(self, match): src, srcline = self.state_machine.get_source_and_line() indented, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) label = match.group(1) name = normalize_name(label) footnote = nodes.footnote('\n'.join(indented)) footnote.source = src footnote.line = srcline if name[0] == '#': # auto-numbered name = name[1:] # autonumber label footnote['auto'] = 1 if name: footnote['names'].append(name) self.document.note_autofootnote(footnote) elif name == '*': # auto-symbol name = '' footnote['auto'] = '*' self.document.note_symbol_footnote(footnote) else: # manually numbered footnote += nodes.label('', label) footnote['names'].append(name) self.document.note_footnote(footnote) if name: self.document.note_explicit_target(footnote, footnote) else: self.document.set_id(footnote, footnote) if indented: self.nested_parse(indented, input_offset=offset, node=footnote) return [footnote], blank_finish def citation(self, match): src, srcline = self.state_machine.get_source_and_line() indented, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) label = match.group(1) name = normalize_name(label) citation = nodes.citation('\n'.join(indented)) citation.source = src citation.line = srcline citation += nodes.label('', label) citation['names'].append(name) self.document.note_citation(citation) self.document.note_explicit_target(citation, citation) if indented: self.nested_parse(indented, input_offset=offset, node=citation) return [citation], blank_finish def hyperlink_target(self, match): pattern = self.explicit.patterns.target lineno = self.state_machine.abs_line_number() block, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented( match.end(), until_blank=True, strip_indent=False) blocktext = match.string[:match.end()] + '\n'.join(block) block = [escape2null(line) for line in block] escaped = block[0] blockindex = 0 while True: targetmatch = pattern.match(escaped) if targetmatch: break blockindex += 1 try: escaped += block[blockindex] except IndexError: raise MarkupError('malformed hyperlink target.') del block[:blockindex] block[0] = (block[0] + ' ')[targetmatch.end()-len(escaped)-1:].strip() target = self.make_target(block, blocktext, lineno, targetmatch.group('name')) return [target], blank_finish def make_target(self, block, block_text, lineno, target_name): target_type, data = self.parse_target(block, block_text, lineno) if target_type == 'refname': target = nodes.target(block_text, '', refname=normalize_name(data)) target.indirect_reference_name = data self.add_target(target_name, '', target, lineno) self.document.note_indirect_target(target) return target elif target_type == 'refuri': target = nodes.target(block_text, '') self.add_target(target_name, data, target, lineno) return target else: return data def parse_target(self, block, block_text, lineno): """ Determine the type of reference of a target. :Return: A 2-tuple, one of: - 'refname' and the indirect reference name - 'refuri' and the URI - 'malformed' and a system_message node """ if block and block[-1].strip()[-1:] == '_': # possible indirect target reference = ' '.join([line.strip() for line in block]) refname = self.is_reference(reference) if refname: return 'refname', refname reference = ''.join([''.join(line.split()) for line in block]) return 'refuri', unescape(reference) def is_reference(self, reference): match = self.explicit.patterns.reference.match( whitespace_normalize_name(reference)) if not match: return None return unescape(match.group('simple') or match.group('phrase')) def add_target(self, targetname, refuri, target, lineno): target.line = lineno if targetname: name = normalize_name(unescape(targetname)) target['names'].append(name) if refuri: uri = self.inliner.adjust_uri(refuri) if uri: target['refuri'] = uri else: raise ApplicationError('problem with URI: %r' % refuri) self.document.note_explicit_target(target, self.parent) else: # anonymous target if refuri: target['refuri'] = refuri target['anonymous'] = 1 self.document.note_anonymous_target(target) def substitution_def(self, match): pattern = self.explicit.patterns.substitution src, srcline = self.state_machine.get_source_and_line() block, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end(), strip_indent=False) blocktext = (match.string[:match.end()] + '\n'.join(block)) block.disconnect() escaped = escape2null(block[0].rstrip()) blockindex = 0 while True: subdefmatch = pattern.match(escaped) if subdefmatch: break blockindex += 1 try: escaped = escaped + ' ' + escape2null(block[blockindex].strip()) except IndexError: raise MarkupError('malformed substitution definition.') del block[:blockindex] # strip out the substitution marker block[0] = (block[0].strip() + ' ')[subdefmatch.end()-len(escaped)-1:-1] if not block[0]: del block[0] offset += 1 while block and not block[-1].strip(): block.pop() subname = subdefmatch.group('name') substitution_node = nodes.substitution_definition(blocktext) substitution_node.source = src substitution_node.line = srcline if not block: msg = self.reporter.warning( 'Substitution definition "%s" missing contents.' % subname, nodes.literal_block(blocktext, blocktext), source=src, line=srcline) return [msg], blank_finish block[0] = block[0].strip() substitution_node['names'].append( nodes.whitespace_normalize_name(subname)) new_abs_offset, blank_finish = self.nested_list_parse( block, input_offset=offset, node=substitution_node, initial_state='SubstitutionDef', blank_finish=blank_finish) i = 0 for node in substitution_node[:]: if not (isinstance(node, nodes.Inline) or isinstance(node, nodes.Text)): self.parent += substitution_node[i] del substitution_node[i] else: i += 1 for node in substitution_node.traverse(nodes.Element): if self.disallowed_inside_substitution_definitions(node): pformat = nodes.literal_block('', node.pformat().rstrip()) msg = self.reporter.error( 'Substitution definition contains illegal element:', pformat, nodes.literal_block(blocktext, blocktext), source=src, line=srcline) return [msg], blank_finish if len(substitution_node) == 0: msg = self.reporter.warning( 'Substitution definition "%s" empty or invalid.' % subname, nodes.literal_block(blocktext, blocktext), source=src, line=srcline) return [msg], blank_finish self.document.note_substitution_def( substitution_node, subname, self.parent) return [substitution_node], blank_finish def disallowed_inside_substitution_definitions(self, node): if (node['ids'] or isinstance(node, nodes.reference) and node.get('anonymous') or isinstance(node, nodes.footnote_reference) and node.get('auto')): return 1 else: return 0 def directive(self, match, **option_presets): """Returns a 2-tuple: list of nodes, and a "blank finish" boolean.""" type_name = match.group(1) directive_class, messages = directives.directive( type_name, self.memo.language, self.document) self.parent += messages if directive_class: return self.run_directive( directive_class, match, type_name, option_presets) else: return self.unknown_directive(type_name) def run_directive(self, directive, match, type_name, option_presets): """ Parse a directive then run its directive function. Parameters: - `directive`: The class implementing the directive. Must be a subclass of `rst.Directive`. - `match`: A regular expression match object which matched the first line of the directive. - `type_name`: The directive name, as used in the source text. - `option_presets`: A dictionary of preset options, defaults for the directive options. Currently, only an "alt" option is passed by substitution definitions (value: the substitution name), which may be used by an embedded image directive. Returns a 2-tuple: list of nodes, and a "blank finish" boolean. """ if isinstance(directive, (FunctionType, MethodType)): from docutils.parsers.rst import convert_directive_function directive = convert_directive_function(directive) lineno = self.state_machine.abs_line_number() initial_line_offset = self.state_machine.line_offset indented, indent, line_offset, blank_finish \ = self.state_machine.get_first_known_indented(match.end(), strip_top=0) block_text = '\n'.join(self.state_machine.input_lines[ initial_line_offset : self.state_machine.line_offset + 1]) try: arguments, options, content, content_offset = ( self.parse_directive_block(indented, line_offset, directive, option_presets)) except MarkupError, detail: error = self.reporter.error( 'Error in "%s" directive:\n%s.' % (type_name, ' '.join(detail.args)), nodes.literal_block(block_text, block_text), line=lineno) return [error], blank_finish directive_instance = directive( type_name, arguments, options, content, lineno, content_offset, block_text, self, self.state_machine) try: result = directive_instance.run() except docutils.parsers.rst.DirectiveError, error: msg_node = self.reporter.system_message(error.level, error.msg, line=lineno) msg_node += nodes.literal_block(block_text, block_text) result = [msg_node] assert isinstance(result, list), \ 'Directive "%s" must return a list of nodes.' % type_name for i in range(len(result)): assert isinstance(result[i], nodes.Node), \ ('Directive "%s" returned non-Node object (index %s): %r' % (type_name, i, result[i])) return (result, blank_finish or self.state_machine.is_next_line_blank()) def parse_directive_block(self, indented, line_offset, directive, option_presets): option_spec = directive.option_spec has_content = directive.has_content if indented and not indented[0].strip(): indented.trim_start() line_offset += 1 while indented and not indented[-1].strip(): indented.trim_end() if indented and (directive.required_arguments or directive.optional_arguments or option_spec): for i, line in enumerate(indented): if not line.strip(): break else: i += 1 arg_block = indented[:i] content = indented[i+1:] content_offset = line_offset + i + 1 else: content = indented content_offset = line_offset arg_block = [] if option_spec: options, arg_block = self.parse_directive_options( option_presets, option_spec, arg_block) else: options = {} if arg_block and not (directive.required_arguments or directive.optional_arguments): content = arg_block + indented[i:] content_offset = line_offset arg_block = [] while content and not content[0].strip(): content.trim_start() content_offset += 1 if directive.required_arguments or directive.optional_arguments: arguments = self.parse_directive_arguments( directive, arg_block) else: arguments = [] if content and not has_content: raise MarkupError('no content permitted') return (arguments, options, content, content_offset) def parse_directive_options(self, option_presets, option_spec, arg_block): options = option_presets.copy() for i, line in enumerate(arg_block): if re.match(Body.patterns['field_marker'], line): opt_block = arg_block[i:] arg_block = arg_block[:i] break else: opt_block = [] if opt_block: success, data = self.parse_extension_options(option_spec, opt_block) if success: # data is a dict of options options.update(data) else: # data is an error string raise MarkupError(data) return options, arg_block def parse_directive_arguments(self, directive, arg_block): required = directive.required_arguments optional = directive.optional_arguments arg_text = '\n'.join(arg_block) arguments = arg_text.split() if len(arguments) < required: raise MarkupError('%s argument(s) required, %s supplied' % (required, len(arguments))) elif len(arguments) > required + optional: if directive.final_argument_whitespace: arguments = arg_text.split(None, required + optional - 1) else: raise MarkupError( 'maximum %s argument(s) allowed, %s supplied' % (required + optional, len(arguments))) return arguments def parse_extension_options(self, option_spec, datalines): """ Parse `datalines` for a field list containing extension options matching `option_spec`. :Parameters: - `option_spec`: a mapping of option name to conversion function, which should raise an exception on bad input. - `datalines`: a list of input strings. :Return: - Success value, 1 or 0. - An option dictionary on success, an error string on failure. """ node = nodes.field_list() newline_offset, blank_finish = self.nested_list_parse( datalines, 0, node, initial_state='ExtensionOptions', blank_finish=True) if newline_offset != len(datalines): # incomplete parse of block return 0, 'invalid option block' try: options = utils.extract_extension_options(node, option_spec) except KeyError, detail: return 0, ('unknown option: "%s"' % detail.args[0]) except (ValueError, TypeError), detail: return 0, ('invalid option value: %s' % ' '.join(detail.args)) except utils.ExtensionOptionError, detail: return 0, ('invalid option data: %s' % ' '.join(detail.args)) if blank_finish: return 1, options else: return 0, 'option data incompletely parsed' def unknown_directive(self, type_name): lineno = self.state_machine.abs_line_number() indented, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented(0, strip_indent=False) text = '\n'.join(indented) error = self.reporter.error( 'Unknown directive type "%s".' % type_name, nodes.literal_block(text, text), line=lineno) return [error], blank_finish def comment(self, match): if not match.string[match.end():].strip() \ and self.state_machine.is_next_line_blank(): # an empty comment? return [nodes.comment()], 1 # "A tiny but practical wart." indented, indent, offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end()) while indented and not indented[-1].strip(): indented.trim_end() text = '\n'.join(indented) return [nodes.comment(text, text)], blank_finish explicit.constructs = [ (footnote, re.compile(r""" \.\.[ ]+ # explicit markup start \[ ( # footnote label: [0-9]+ # manually numbered footnote | # *OR* \# # anonymous auto-numbered footnote | # *OR* \#%s # auto-number ed?) footnote label | # *OR* \* # auto-symbol footnote ) \] ([ ]+|$) # whitespace or end of line """ % Inliner.simplename, re.VERBOSE | re.UNICODE)), (citation, re.compile(r""" \.\.[ ]+ # explicit markup start \[(%s)\] # citation label ([ ]+|$) # whitespace or end of line """ % Inliner.simplename, re.VERBOSE | re.UNICODE)), (hyperlink_target, re.compile(r""" \.\.[ ]+ # explicit markup start _ # target indicator (?![ ]|$) # first char. not space or EOL """, re.VERBOSE | re.UNICODE)), (substitution_def, re.compile(r""" \.\.[ ]+ # explicit markup start \| # substitution indicator (?![ ]|$) # first char. not space or EOL """, re.VERBOSE | re.UNICODE)), (directive, re.compile(r""" \.\.[ ]+ # explicit markup start (%s) # directive name [ ]? # optional space :: # directive delimiter ([ ]+|$) # whitespace or end of line """ % Inliner.simplename, re.VERBOSE | re.UNICODE))] def explicit_markup(self, match, context, next_state): """Footnotes, hyperlink targets, directives, comments.""" nodelist, blank_finish = self.explicit_construct(match) self.parent += nodelist self.explicit_list(blank_finish) return [], next_state, [] def explicit_construct(self, match): """Determine which explicit construct this is, parse & return it.""" errors = [] for method, pattern in self.explicit.constructs: expmatch = pattern.match(match.string) if expmatch: try: return method(self, expmatch) except MarkupError, error: lineno = self.state_machine.abs_line_number() message = ' '.join(error.args) errors.append(self.reporter.warning(message, line=lineno)) break nodelist, blank_finish = self.comment(match) return nodelist + errors, blank_finish def explicit_list(self, blank_finish): """ Create a nested state machine for a series of explicit markup constructs (including anonymous hyperlink targets). """ offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=self.parent, initial_state='Explicit', blank_finish=blank_finish, match_titles=self.state_machine.match_titles) self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning('Explicit markup') def anonymous(self, match, context, next_state): """Anonymous hyperlink targets.""" nodelist, blank_finish = self.anonymous_target(match) self.parent += nodelist self.explicit_list(blank_finish) return [], next_state, [] def anonymous_target(self, match): lineno = self.state_machine.abs_line_number() block, indent, offset, blank_finish \ = self.state_machine.get_first_known_indented(match.end(), until_blank=True) blocktext = match.string[:match.end()] + '\n'.join(block) block = [escape2null(line) for line in block] target = self.make_target(block, blocktext, lineno, '') return [target], blank_finish def line(self, match, context, next_state): """Section title overline or transition marker.""" if self.state_machine.match_titles: return [match.string], 'Line', [] elif match.string.strip() == '::': raise statemachine.TransitionCorrection('text') elif len(match.string.strip()) < 4: msg = self.reporter.info( 'Unexpected possible title overline or transition.\n' "Treating it as ordinary text because it's so short.", line=self.state_machine.abs_line_number()) self.parent += msg raise statemachine.TransitionCorrection('text') else: blocktext = self.state_machine.line msg = self.reporter.severe( 'Unexpected section title or transition.', nodes.literal_block(blocktext, blocktext), line=self.state_machine.abs_line_number()) self.parent += msg return [], next_state, [] def text(self, match, context, next_state): """Titles, definition lists, paragraphs.""" return [match.string], 'Text', [] class RFC2822Body(Body): """ RFC2822 headers are only valid as the first constructs in documents. As soon as anything else appears, the `Body` state should take over. """ patterns = Body.patterns.copy() # can't modify the original patterns['rfc2822'] = r'[!-9;-~]+:( +|$)' initial_transitions = [(name, 'Body') for name in Body.initial_transitions] initial_transitions.insert(-1, ('rfc2822', 'Body')) # just before 'text' def rfc2822(self, match, context, next_state): """RFC2822-style field list item.""" fieldlist = nodes.field_list(classes=['rfc2822']) self.parent += fieldlist field, blank_finish = self.rfc2822_field(match) fieldlist += field offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=fieldlist, initial_state='RFC2822List', blank_finish=blank_finish) self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning( 'RFC2822-style field list') return [], next_state, [] def rfc2822_field(self, match): name = match.string[:match.string.find(':')] indented, indent, line_offset, blank_finish = \ self.state_machine.get_first_known_indented(match.end(), until_blank=True) fieldnode = nodes.field() fieldnode += nodes.field_name(name, name) fieldbody = nodes.field_body('\n'.join(indented)) fieldnode += fieldbody if indented: self.nested_parse(indented, input_offset=line_offset, node=fieldbody) return fieldnode, blank_finish class SpecializedBody(Body): """ Superclass for second and subsequent compound element members. Compound elements are lists and list-like constructs. All transition methods are disabled (redefined as `invalid_input`). Override individual methods in subclasses to re-enable. For example, once an initial bullet list item, say, is recognized, the `BulletList` subclass takes over, with a "bullet_list" node as its container. Upon encountering the initial bullet list item, `Body.bullet` calls its ``self.nested_list_parse`` (`RSTState.nested_list_parse`), which starts up a nested parsing session with `BulletList` as the initial state. Only the ``bullet`` transition method is enabled in `BulletList`; as long as only bullet list items are encountered, they are parsed and inserted into the container. The first construct which is *not* a bullet list item triggers the `invalid_input` method, which ends the nested parse and closes the container. `BulletList` needs to recognize input that is invalid in the context of a bullet list, which means everything *other than* bullet list items, so it inherits the transition list created in `Body`. """ def invalid_input(self, match=None, context=None, next_state=None): """Not a compound element member. Abort this state machine.""" self.state_machine.previous_line() # back up so parent SM can reassess raise EOFError indent = invalid_input bullet = invalid_input enumerator = invalid_input field_marker = invalid_input option_marker = invalid_input doctest = invalid_input line_block = invalid_input grid_table_top = invalid_input simple_table_top = invalid_input explicit_markup = invalid_input anonymous = invalid_input line = invalid_input text = invalid_input class BulletList(SpecializedBody): """Second and subsequent bullet_list list_items.""" def bullet(self, match, context, next_state): """Bullet list item.""" if match.string[0] != self.parent['bullet']: # different bullet: new list self.invalid_input() listitem, blank_finish = self.list_item(match.end()) self.parent += listitem self.blank_finish = blank_finish return [], next_state, [] class DefinitionList(SpecializedBody): """Second and subsequent definition_list_items.""" def text(self, match, context, next_state): """Definition lists.""" return [match.string], 'Definition', [] class EnumeratedList(SpecializedBody): """Second and subsequent enumerated_list list_items.""" def enumerator(self, match, context, next_state): """Enumerated list item.""" format, sequence, text, ordinal = self.parse_enumerator( match, self.parent['enumtype']) if ( format != self.format or (sequence != '#' and (sequence != self.parent['enumtype'] or self.auto or ordinal != (self.lastordinal + 1))) or not self.is_enumerated_list_item(ordinal, sequence, format)): # different enumeration: new list self.invalid_input() if sequence == '#': self.auto = 1 listitem, blank_finish = self.list_item(match.end()) self.parent += listitem self.blank_finish = blank_finish self.lastordinal = ordinal return [], next_state, [] class FieldList(SpecializedBody): """Second and subsequent field_list fields.""" def field_marker(self, match, context, next_state): """Field list field.""" field, blank_finish = self.field(match) self.parent += field self.blank_finish = blank_finish return [], next_state, [] class OptionList(SpecializedBody): """Second and subsequent option_list option_list_items.""" def option_marker(self, match, context, next_state): """Option list item.""" try: option_list_item, blank_finish = self.option_list_item(match) except MarkupError: self.invalid_input() self.parent += option_list_item self.blank_finish = blank_finish return [], next_state, [] class RFC2822List(SpecializedBody, RFC2822Body): """Second and subsequent RFC2822-style field_list fields.""" patterns = RFC2822Body.patterns initial_transitions = RFC2822Body.initial_transitions def rfc2822(self, match, context, next_state): """RFC2822-style field list item.""" field, blank_finish = self.rfc2822_field(match) self.parent += field self.blank_finish = blank_finish return [], 'RFC2822List', [] blank = SpecializedBody.invalid_input class ExtensionOptions(FieldList): """ Parse field_list fields for extension options. No nested parsing is done (including inline markup parsing). """ def parse_field_body(self, indented, offset, node): """Override `Body.parse_field_body` for simpler parsing.""" lines = [] for line in list(indented) + ['']: if line.strip(): lines.append(line) elif lines: text = '\n'.join(lines) node += nodes.paragraph(text, text) lines = [] class LineBlock(SpecializedBody): """Second and subsequent lines of a line_block.""" blank = SpecializedBody.invalid_input def line_block(self, match, context, next_state): """New line of line block.""" lineno = self.state_machine.abs_line_number() line, messages, blank_finish = self.line_block_line(match, lineno) self.parent += line self.parent.parent += messages self.blank_finish = blank_finish return [], next_state, [] class Explicit(SpecializedBody): """Second and subsequent explicit markup construct.""" def explicit_markup(self, match, context, next_state): """Footnotes, hyperlink targets, directives, comments.""" nodelist, blank_finish = self.explicit_construct(match) self.parent += nodelist self.blank_finish = blank_finish return [], next_state, [] def anonymous(self, match, context, next_state): """Anonymous hyperlink targets.""" nodelist, blank_finish = self.anonymous_target(match) self.parent += nodelist self.blank_finish = blank_finish return [], next_state, [] blank = SpecializedBody.invalid_input class SubstitutionDef(Body): """ Parser for the contents of a substitution_definition element. """ patterns = { 'embedded_directive': re.compile(r'(%s)::( +|$)' % Inliner.simplename, re.UNICODE), 'text': r''} initial_transitions = ['embedded_directive', 'text'] def embedded_directive(self, match, context, next_state): nodelist, blank_finish = self.directive(match, alt=self.parent['names'][0]) self.parent += nodelist if not self.state_machine.at_eof(): self.blank_finish = blank_finish raise EOFError def text(self, match, context, next_state): if not self.state_machine.at_eof(): self.blank_finish = self.state_machine.is_next_line_blank() raise EOFError class Text(RSTState): """ Classifier of second line of a text block. Could be a paragraph, a definition list item, or a title. """ patterns = {'underline': Body.patterns['line'], 'text': r''} initial_transitions = [('underline', 'Body'), ('text', 'Body')] def blank(self, match, context, next_state): """End of paragraph.""" # NOTE: self.paragraph returns [ node, system_message(s) ], literalnext paragraph, literalnext = self.paragraph( context, self.state_machine.abs_line_number() - 1) self.parent += paragraph if literalnext: self.parent += self.literal_block() return [], 'Body', [] def eof(self, context): if context: self.blank(None, context, None) return [] def indent(self, match, context, next_state): """Definition list item.""" definitionlist = nodes.definition_list() definitionlistitem, blank_finish = self.definition_list_item(context) definitionlist += definitionlistitem self.parent += definitionlist offset = self.state_machine.line_offset + 1 # next line newline_offset, blank_finish = self.nested_list_parse( self.state_machine.input_lines[offset:], input_offset=self.state_machine.abs_line_offset() + 1, node=definitionlist, initial_state='DefinitionList', blank_finish=blank_finish, blank_finish_state='Definition') self.goto_line(newline_offset) if not blank_finish: self.parent += self.unindent_warning('Definition list') return [], 'Body', [] def underline(self, match, context, next_state): """Section title.""" lineno = self.state_machine.abs_line_number() title = context[0].rstrip() underline = match.string.rstrip() source = title + '\n' + underline messages = [] if column_width(title) > len(underline): if len(underline) < 4: if self.state_machine.match_titles: msg = self.reporter.info( 'Possible title underline, too short for the title.\n' "Treating it as ordinary text because it's so short.", line=lineno) self.parent += msg raise statemachine.TransitionCorrection('text') else: blocktext = context[0] + '\n' + self.state_machine.line msg = self.reporter.warning('Title underline too short.', nodes.literal_block(blocktext, blocktext), line=lineno) messages.append(msg) if not self.state_machine.match_titles: blocktext = context[0] + '\n' + self.state_machine.line # We need get_source_and_line() here to report correctly src, srcline = self.state_machine.get_source_and_line() # TODO: why is abs_line_number() == srcline+1 # if the error is in a table (try with test_tables.py)? # print "get_source_and_line", srcline # print "abs_line_number", self.state_machine.abs_line_number() msg = self.reporter.severe('Unexpected section title.', nodes.literal_block(blocktext, blocktext), source=src, line=srcline) self.parent += messages self.parent += msg return [], next_state, [] style = underline[0] context[:] = [] self.section(title, source, style, lineno - 1, messages) return [], next_state, [] def text(self, match, context, next_state): """Paragraph.""" startline = self.state_machine.abs_line_number() - 1 msg = None try: block = self.state_machine.get_text_block(flush_left=True) except statemachine.UnexpectedIndentationError, err: block, src, srcline = err.args msg = self.reporter.error('Unexpected indentation.', source=src, line=srcline) lines = context + list(block) paragraph, literalnext = self.paragraph(lines, startline) self.parent += paragraph self.parent += msg if literalnext: try: self.state_machine.next_line() except EOFError: pass self.parent += self.literal_block() return [], next_state, [] def literal_block(self): """Return a list of nodes.""" indented, indent, offset, blank_finish = \ self.state_machine.get_indented() while indented and not indented[-1].strip(): indented.trim_end() if not indented: return self.quoted_literal_block() data = '\n'.join(indented) literal_block = nodes.literal_block(data, data) literal_block.line = offset + 1 nodelist = [literal_block] if not blank_finish: nodelist.append(self.unindent_warning('Literal block')) return nodelist def quoted_literal_block(self): abs_line_offset = self.state_machine.abs_line_offset() offset = self.state_machine.line_offset parent_node = nodes.Element() new_abs_offset = self.nested_parse( self.state_machine.input_lines[offset:], input_offset=abs_line_offset, node=parent_node, match_titles=False, state_machine_kwargs={'state_classes': (QuotedLiteralBlock,), 'initial_state': 'QuotedLiteralBlock'}) self.goto_line(new_abs_offset) return parent_node.children def definition_list_item(self, termline): indented, indent, line_offset, blank_finish = \ self.state_machine.get_indented() itemnode = nodes.definition_list_item( '\n'.join(termline + list(indented))) lineno = self.state_machine.abs_line_number() - 1 (itemnode.source, itemnode.line) = self.state_machine.get_source_and_line(lineno) termlist, messages = self.term(termline, lineno) itemnode += termlist definition = nodes.definition('', *messages) itemnode += definition if termline[0][-2:] == '::': definition += self.reporter.info( 'Blank line missing before literal block (after the "::")? ' 'Interpreted as a definition list item.', line=lineno+1) self.nested_parse(indented, input_offset=line_offset, node=definition) return itemnode, blank_finish classifier_delimiter = re.compile(' +: +') def term(self, lines, lineno): """Return a definition_list's term and optional classifiers.""" assert len(lines) == 1 text_nodes, messages = self.inline_text(lines[0], lineno) term_node = nodes.term() (term_node.source, term_node.line) = self.state_machine.get_source_and_line(lineno) term_node.rawsource = unescape(lines[0]) node_list = [term_node] for i in range(len(text_nodes)): node = text_nodes[i] if isinstance(node, nodes.Text): parts = self.classifier_delimiter.split(node.rawsource) if len(parts) == 1: node_list[-1] += node else: node_list[-1] += nodes.Text(parts[0].rstrip()) for part in parts[1:]: classifier_node = nodes.classifier('', part) node_list.append(classifier_node) else: node_list[-1] += node return node_list, messages class SpecializedText(Text): """ Superclass for second and subsequent lines of Text-variants. All transition methods are disabled. Override individual methods in subclasses to re-enable. """ def eof(self, context): """Incomplete construct.""" return [] def invalid_input(self, match=None, context=None, next_state=None): """Not a compound element member. Abort this state machine.""" raise EOFError blank = invalid_input indent = invalid_input underline = invalid_input text = invalid_input class Definition(SpecializedText): """Second line of potential definition_list_item.""" def eof(self, context): """Not a definition.""" self.state_machine.previous_line(2) # so parent SM can reassess return [] def indent(self, match, context, next_state): """Definition list item.""" itemnode, blank_finish = self.definition_list_item(context) self.parent += itemnode self.blank_finish = blank_finish return [], 'DefinitionList', [] class Line(SpecializedText): """ Second line of over- & underlined section title or transition marker. """ eofcheck = 1 # @@@ ??? """Set to 0 while parsing sections, so that we don't catch the EOF.""" def eof(self, context): """Transition marker at end of section or document.""" marker = context[0].strip() if self.memo.section_bubble_up_kludge: self.memo.section_bubble_up_kludge = False elif len(marker) < 4: self.state_correction(context) if self.eofcheck: # ignore EOFError with sections lineno = self.state_machine.abs_line_number() - 1 transition = nodes.transition(rawsource=context[0]) transition.line = lineno self.parent += transition self.eofcheck = 1 return [] def blank(self, match, context, next_state): """Transition marker.""" src, srcline = self.state_machine.get_source_and_line() marker = context[0].strip() if len(marker) < 4: self.state_correction(context) transition = nodes.transition(rawsource=marker) transition.source = src transition.line = srcline - 1 self.parent += transition return [], 'Body', [] def text(self, match, context, next_state): """Potential over- & underlined title.""" lineno = self.state_machine.abs_line_number() - 1 overline = context[0] title = match.string underline = '' try: underline = self.state_machine.next_line() except EOFError: blocktext = overline + '\n' + title if len(overline.rstrip()) < 4: self.short_overline(context, blocktext, lineno, 2) else: msg = self.reporter.severe( 'Incomplete section title.', nodes.literal_block(blocktext, blocktext), line=lineno) self.parent += msg return [], 'Body', [] source = '%s\n%s\n%s' % (overline, title, underline) overline = overline.rstrip() underline = underline.rstrip() if not self.transitions['underline'][0].match(underline): blocktext = overline + '\n' + title + '\n' + underline if len(overline.rstrip()) < 4: self.short_overline(context, blocktext, lineno, 2) else: msg = self.reporter.severe( 'Missing matching underline for section title overline.', nodes.literal_block(source, source), line=lineno) self.parent += msg return [], 'Body', [] elif overline != underline: blocktext = overline + '\n' + title + '\n' + underline if len(overline.rstrip()) < 4: self.short_overline(context, blocktext, lineno, 2) else: msg = self.reporter.severe( 'Title overline & underline mismatch.', nodes.literal_block(source, source), line=lineno) self.parent += msg return [], 'Body', [] title = title.rstrip() messages = [] if column_width(title) > len(overline): blocktext = overline + '\n' + title + '\n' + underline if len(overline.rstrip()) < 4: self.short_overline(context, blocktext, lineno, 2) else: msg = self.reporter.warning( 'Title overline too short.', nodes.literal_block(source, source), line=lineno) messages.append(msg) style = (overline[0], underline[0]) self.eofcheck = 0 # @@@ not sure this is correct self.section(title.lstrip(), source, style, lineno + 1, messages) self.eofcheck = 1 return [], 'Body', [] indent = text # indented title def underline(self, match, context, next_state): overline = context[0] blocktext = overline + '\n' + self.state_machine.line lineno = self.state_machine.abs_line_number() - 1 if len(overline.rstrip()) < 4: self.short_overline(context, blocktext, lineno, 1) msg = self.reporter.error( 'Invalid section title or transition marker.', nodes.literal_block(blocktext, blocktext), line=lineno) self.parent += msg return [], 'Body', [] def short_overline(self, context, blocktext, lineno, lines=1): msg = self.reporter.info( 'Possible incomplete section title.\nTreating the overline as ' "ordinary text because it's so short.", line=lineno) self.parent += msg self.state_correction(context, lines) def state_correction(self, context, lines=1): self.state_machine.previous_line(lines) context[:] = [] raise statemachine.StateCorrection('Body', 'text') class QuotedLiteralBlock(RSTState): """ Nested parse handler for quoted (unindented) literal blocks. Special-purpose. Not for inclusion in `state_classes`. """ patterns = {'initial_quoted': r'(%(nonalphanum7bit)s)' % Body.pats, 'text': r''} initial_transitions = ('initial_quoted', 'text') def __init__(self, state_machine, debug=False): RSTState.__init__(self, state_machine, debug) self.messages = [] self.initial_lineno = None def blank(self, match, context, next_state): if context: raise EOFError else: return context, next_state, [] def eof(self, context): if context: src, srcline = self.state_machine.get_source_and_line( self.initial_lineno) text = '\n'.join(context) literal_block = nodes.literal_block(text, text) literal_block.source = src literal_block.line = srcline self.parent += literal_block else: self.parent += self.reporter.warning( 'Literal block expected; none found.', line=self.state_machine.abs_line_number()) # src not available, because statemachine.input_lines is empty self.state_machine.previous_line() self.parent += self.messages return [] def indent(self, match, context, next_state): assert context, ('QuotedLiteralBlock.indent: context should not ' 'be empty!') self.messages.append( self.reporter.error('Unexpected indentation.', line=self.state_machine.abs_line_number())) self.state_machine.previous_line() raise EOFError def initial_quoted(self, match, context, next_state): """Match arbitrary quote character on the first line only.""" self.remove_transition('initial_quoted') quote = match.string[0] pattern = re.compile(re.escape(quote), re.UNICODE) # New transition matches consistent quotes only: self.add_transition('quoted', (pattern, self.quoted, self.__class__.__name__)) self.initial_lineno = self.state_machine.abs_line_number() return [match.string], next_state, [] def quoted(self, match, context, next_state): """Match consistent quotes on subsequent lines.""" context.append(match.string) return context, next_state, [] def text(self, match, context, next_state): if context: self.messages.append( self.reporter.error('Inconsistent literal block quoting.', line=self.state_machine.abs_line_number())) self.state_machine.previous_line() raise EOFError state_classes = (Body, BulletList, DefinitionList, EnumeratedList, FieldList, OptionList, LineBlock, ExtensionOptions, Explicit, Text, Definition, Line, SubstitutionDef, RFC2822Body, RFC2822List) """Standard set of State classes used to start `RSTStateMachine`."""