• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2CORE MARKDOWN BLOCKPARSER
3=============================================================================
4
5This parser handles basic parsing of Markdown blocks.  It doesn't concern itself
6with inline elements such as **bold** or *italics*, but rather just catches
7blocks, lists, quotes, etc.
8
The BlockParser is made up of a bunch of BlockProcessors, each handling a
10different type of block. Extensions may add/replace/remove BlockProcessors
11as they need to alter how markdown blocks are parsed.
12
13"""
14
15import re
16import markdown
17
class BlockProcessor:
    """ Common base for every block processor.

    A processor recognises one kind of Markdown block. Subclasses supply
    their own ``test`` and ``run`` methods: ``test`` says whether a block
    belongs to the processor and, when it does, the parser calls ``run`` to
    consume the block and update the tree. The other methods are helpers
    shared by the subclasses.

    """

    def __init__(self, parser=None):
        # Parser this processor is attached to; subclasses use it to parse
        # nested content and to query the parser state.
        self.parser = parser

    def lastChild(self, parent):
        """ Return the final child of ``parent``, or None if it has none. """
        if not len(parent):
            return None
        return parent[-1]

    def detab(self, text):
        """ Strip one indent level from the leading run of indented lines.

        Returns a 2-tuple of strings: the detabbed leading lines, and the
        untouched remainder starting at the first non-blank line that is
        not indented by ``markdown.TAB_LENGTH`` spaces.
        """
        indent = ' '*markdown.TAB_LENGTH
        lines = text.split('\n')
        stripped = []
        for line in lines:
            if line.startswith(indent):
                stripped.append(line[markdown.TAB_LENGTH:])
            elif line.strip():
                # First dedented line with content ends the indented run.
                break
            else:
                # Blank lines stay part of the indented run.
                stripped.append('')
        remainder = lines[len(stripped):]
        return '\n'.join(stripped), '\n'.join(remainder)

    def looseDetab(self, text, level=1):
        """ Strip ``level`` indents from each line that has them.

        Unlike ``detab``, lines that are not indented deeply enough are kept
        unchanged and do not stop the processing.
        """
        width = markdown.TAB_LENGTH*level
        prefix = ' '*markdown.TAB_LENGTH*level

        def strip_indent(line):
            if line.startswith(prefix):
                return line[width:]
            return line

        return '\n'.join([strip_indent(line) for line in text.split('\n')])

    def test(self, parent, block):
        """ Report whether ``block`` is of this processor's type.

        Subclasses must override this method. The parser walks through its
        processors and asks each one, via ``test``, whether the given block
        of text is its kind of block. The answer must be a boolean ``True``
        or ``False``. How the decision is made is up to the block type: a
        plain ``block.startswith(some_string)`` may do, or a complex regular
        expression may be needed. Because the type of a block can depend on
        where it sits (i.e. inside a list), the parent etree element is
        passed in as well and may take part in the decision.

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """

    def run(self, parent, blocks):
        """ Process the block. Subclasses must override this method.

        Once the parser has settled on the type of a block it calls the
        ``run`` method of the matching processor, which should parse the
        lines of the block and append the result to the etree.

        Both ``parent`` and ``blocks`` refer to live objects that have to be
        edited in place; there is no mechanism for handing back new or
        different objects to replace them.

        In practice this method adds SubElements or text to the parent, and
        removes (``pop``) items from, or adds (``insert``) items to, the list
        of blocks.

        Keywords:

        * ``parent``: A etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
103
104
class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    # One or more complete indent levels at the very start of a block.
    INDENT_RE = re.compile(r'^(([ ]{%s})+)'% markdown.TAB_LENGTH)
    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def test(self, parent, block):
        """ Return True for an indented block that belongs to a list.

        The block must start with a full indent, the parser must not already
        be in the ``detabbed`` state, and ``parent`` must either be a list
        item or have a non-empty list as its last child.
        """
        if not block.startswith(' '*markdown.TAB_LENGTH):
            return False
        if self.parser.state.isstate('detabbed'):
            return False
        if parent.tag in self.ITEM_TYPES:
            return True
        if not len(parent):
            return False
        last = parent[-1]
        # ``len(last)`` replaces the former ``parent[-1] and ...`` truth
        # test: an etree element without children is falsy and truth-testing
        # elements is deprecated, so the "has children" condition is now
        # spelled out to keep the result stable.
        return len(last) > 0 and last.tag in self.LIST_TYPES

    def run(self, parent, blocks):
        """ Detab the first block and parse it under the proper list item.

        Pops the first entry of ``blocks``. The parser is put in the
        ``detabbed`` state while the nested content is parsed and the state
        is reset afterwards.
        """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # The parent is already a li. Just parse the child block.
            self.parser.parseBlocks(parent, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The parent is a list (``ol`` or ``ul``) which has children.
            # Assume the last child li is the parent of this block.
            if sibling[-1].text:
                # If the parent li has text, that text needs to be moved to a p
                block = '%s\n\n%s' % (sibling[-1].text, block)
                sibling[-1].text = ''
            self.parser.parseChunk(sibling[-1], block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = markdown.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a ``(level, element)`` tuple: the list nesting level that
        was reached and the element found at that level (``parent`` itself
        when no descent took place).
        """
        # Get indent level
        m = self.INDENT_RE.match(block)
        if m:
            # Floor division keeps the level an integer on every Python
            # version; the matched run is always a multiple of TAB_LENGTH.
            indent_level = len(m.group(1)) // markdown.TAB_LENGTH
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            # Only descend into list/item elements that have children of
            # their own. This is what the former ``if child and ...`` truth
            # test did implicitly (childless elements are falsy); it is now
            # explicit because truth-testing elements is deprecated.
            if child is not None and len(child) and \
                    (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES):
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent
184
185
class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ Return True when the block starts with a full indent. """
        return block.startswith(' '*markdown.TAB_LENGTH)

    def run(self, parent, blocks):
        """ Turn the first block into code text.

        Pops the first entry of ``blocks``, detabs it and either appends it
        to a directly preceding ``pre``/``code`` pair or creates a new one
        under ``parent``. Any dedented trailing lines are pushed back onto
        ``blocks``.
        """
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        block, theRest = self.detab(block)
        # Compare against None rather than truth-testing the element: an
        # element's truth value reflects its child count and the test is
        # deprecated. ``len(sibling)`` still guards the ``sibling[0]`` access.
        if sibling is not None and sibling.tag == "pre" and len(sibling) \
                    and sibling[0].tag == "code":
            # The previous block was a code block. As blank lines do not start
            # new code blocks, append this block to the previous, adding back
            # linebreaks removed from the split into a list.
            code = sibling[0]
            code.text = markdown.AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = markdown.etree.SubElement(parent, 'pre')
            code = markdown.etree.SubElement(pre, 'code')
            code.text = markdown.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # This block contained unindented line(s) after the first indented
            # line. Insert these lines as the first block of the master blocks
            # list for future processing.
            blocks.insert(0, theRest)
215
216
class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes (lines introduced by ``>``). """

    # A ``>`` marker at the start of any line, indented by at most three
    # spaces. ``group(2)`` holds the text after the marker.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Return True when any line of the block starts a blockquote. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the first block as (part of) a blockquote.

        Pops the first entry of ``blocks``. Lines ahead of the first ``>``
        line are parsed into ``parent`` first; the rest is stripped of its
        markers and parsed into a ``blockquote`` element, reusing the
        preceding sibling when that is already a blockquote.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # Lines before blockquote
            # Pass lines before blockquote in recursively for parsing first.
            self.parser.parseBlocks(parent, [before])
            # Remove ``> `` from beginning of each line.
            block = '\n'.join([self.clean(line) for line in
                            block[m.start():].split('\n')])
        sibling = self.lastChild(parent)
        # Compare against None: an etree element without children is falsy,
        # so a plain truth test would not recognise an existing blockquote
        # that has no child elements and would start a second one.
        if sibling is not None and sibling.tag == "blockquote":
            # Previous block was a blockquote so set that as this blocks parent
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = markdown.etree.SubElement(parent, 'blockquote')
        # Recursively parse block with blockquote as parent.
        self.parser.parseChunk(quote, block)

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        m = self.RE.match(line)
        if line.strip() == ">":
            # A bare marker stands for an empty line inside the quote.
            return ""
        elif m:
            return m.group(2)
        else:
            return line
253
class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. they can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')

    def test(self, parent, block):
        """ Return True when the block starts with a list item marker. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Parse the first block as list items.

        Pops the first entry of ``blocks`` and splits it into items. When the
        last child of ``parent`` is already a list the items are added to
        it, otherwise a new ``self.TAG`` element is created under ``parent``.
        """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)
        # Compare against None rather than truth-testing the element: an
        # element's truth value reflects its child count, not its existence,
        # and the test is deprecated.
        if sibling is not None and sibling.tag in ['ol', 'ul']:
            # Previous block was a list item, so set that as parent
            lst = sibling
            # make sure previous item is in a p.
            if len(lst) and lst[-1].text and not len(lst[-1]):
                p = markdown.etree.SubElement(lst[-1], 'p')
                p.text = lst[-1].text
                lst[-1].text = ''
            # parse first block differently as it gets wrapped in a p.
            li = markdown.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        else:
            # This is a new list so create parent with appropriate tag.
            lst = markdown.etree.SubElement(parent, self.TAG)
        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        for item in items:
            if item.startswith(' '*markdown.TAB_LENGTH):
                # Item is indented. Parse with last item as parent
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent
                li = markdown.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings. Top-level items have their marker
        removed; indented (nested) items keep their full line so that they
        can be recognised by their leading indent later on.
        """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new item. Append
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' '*markdown.TAB_LENGTH):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items
321
322
class UListProcessor(OListProcessor):
    """ Process bulleted (unordered) list blocks.

    All behaviour is inherited from ``OListProcessor``; only the detection
    pattern and the tag of the list element differ.
    """

    # Detect an item (``* item``, ``+ item`` or ``- item``).
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
    TAG = 'ul'
328
329
class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Return True when any line of the block is a hash header. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Create a header element from the first header line of the block.

        Pops the first entry of ``blocks``. Lines ahead of the header are
        parsed into ``parent`` first; lines after it are pushed back onto
        ``blocks`` for later processing.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # All lines before header
            after = block[m.end():]    # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse this lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = markdown.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:
            # This should never happen, but just in case...
            # ``message`` and ``CRITICAL`` are not defined in this module, so
            # the bare names would raise NameError; reach them through the
            # imported package instead.
            # NOTE(review): assumes the ``markdown`` package exposes
            # ``message`` and ``CRITICAL`` - confirm against its __init__.
            markdown.message(markdown.CRITICAL, "We've got a problem header!")
359
360
class SetextHeaderProcessor(BlockProcessor):
    """ Process headers that are underlined with ``=`` or ``-`` (Setext). """

    # A line of text followed by a line starting with at least three ``=``
    # or ``-`` characters. These must be the first two lines of the block.
    RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)

    def test(self, parent, block):
        """ Return True when the block opens with an underlined header. """
        matched = self.RE.match(block)
        return bool(matched)

    def run(self, parent, blocks):
        """ Build the header from the first two lines of the first block.

        Pops the first entry of ``blocks``; lines following the underline
        are pushed back onto ``blocks``.
        """
        lines = blocks.pop(0).split('\n')
        underline = lines[1]
        # ``-`` underlines give a level 2 header, ``=`` underlines level 1.
        level = 2
        if underline.startswith('='):
            level = 1
        header = markdown.etree.SubElement(parent, 'h%d' % level)
        header.text = lines[0].strip()
        remainder = lines[2:]
        if remainder:
            # More lines follow the header. Queue them as the next block.
            blocks.insert(0, '\n'.join(remainder))
382
383
class HRProcessor(BlockProcessor):
    """ Process horizontal rule lines. """

    # Three or more of the same ``*``, ``_`` or ``-`` character, optionally
    # separated by single spaces and indented by at most three spaces.
    RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
    # Finds a rule on any line of a block.
    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
    # Matches a single line of text that is a rule.
    MATCH_RE = re.compile(r'^%s$' % RE)

    def test(self, parent, block):
        """ Return True when any line of the block is a horizontal rule. """
        found = self.SEARCH_RE.search(block)
        return bool(found)

    def run(self, parent, blocks):
        """ Insert an ``hr`` element for the first rule line of the block.

        Pops the first entry of ``blocks``. Lines ahead of the rule are
        parsed into ``parent`` before the ``hr`` is added; lines after the
        rule are pushed back onto ``blocks``.
        """
        lines = blocks.pop(0).split('\n')
        # Position of the first rule line; stays at len(lines) if none is.
        rule_index = len(lines)
        for index, line in enumerate(lines):
            if self.MATCH_RE.match(line):
                rule_index = index
                break
        leading = lines[:rule_index]
        if leading:
            # Whatever comes before the rule has to be parsed first.
            self.parser.parseBlocks(parent, ['\n'.join(leading)])
        # The rule itself.
        markdown.etree.SubElement(parent, 'hr')
        trailing = lines[rule_index+1:]
        if trailing:
            # Whatever comes after the rule is queued as the next block.
            blocks.insert(0, '\n'.join(trailing))
416
417
class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that start with an empty line. """

    # Detect a block that only contains whitespace
    # or only whitespace on the first line.
    RE = re.compile(r'^\s*\n')

    def test(self, parent, block):
        """ Return True when the block begins with a whitespace-only line. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Drop the leading empty line(s) from the first block.

        Pops the first entry of ``blocks`` and pushes back what follows the
        leading whitespace. When the preceding sibling is a code block the
        removed newlines are appended to its text so that blank lines inside
        code are not lost.
        """
        block = blocks.pop(0)
        m = self.RE.match(block)
        if m:
            # Add remaining line to master blocks for later.
            blocks.insert(0, block[m.end():])
            sibling = self.lastChild(parent)
            # Elements are compared against None / checked with ``len``
            # instead of being truth-tested: a ``code`` element holds only
            # text and therefore is falsy, which made this branch unreachable.
            if sibling is not None and sibling.tag == 'pre' and \
                    len(sibling) and sibling[0].tag == 'code':
                # Last block is a codeblock. Append to preserve whitespace.
                # The former literal used ``/n`` (forward slashes), which
                # would have added those characters to the code verbatim.
                # Re-append exactly the newlines that were stripped above.
                stripped_newlines = '\n' * m.group().count('\n')
                sibling[0].text = markdown.AtomicString(
                    '%s%s' % (sibling[0].text, stripped_newlines))
439
440
class ParagraphProcessor(BlockProcessor):
    """ Catch-all processor that turns a block into paragraph text. """

    def test(self, parent, block):
        """ Accept every block. """
        return True

    def run(self, parent, blocks):
        """ Add the first block to ``parent`` as text.

        Pops the first entry of ``blocks``. Blocks holding nothing but
        whitespace are thrown away.
        """
        block = blocks.pop(0)
        if not block.strip():
            # Blank block. Nothing to add.
            return
        if not self.parser.state.isstate('list'):
            # Regular case: wrap the text in a new paragraph element.
            paragraph = markdown.etree.SubElement(parent, 'p')
            paragraph.text = block.lstrip()
            return
        # Tight list: the text goes straight into parent.text.
        if parent.text:
            parent.text = '%s\n%s' % (parent.text, block)
        else:
            parent.text = block.lstrip()
461