#!/usr/bin/env python
#===- cppreference_parser.py -  ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

import collections
import io
import multiprocessing
import os
import re
import signal
import sys

from bs4 import BeautifulSoup, NavigableString


class Symbol:
    """A symbol from a cppreference index page plus the headers naming it."""

    def __init__(self, name, namespace, headers):
        # Unqualified symbol name, e.g. "move".
        self.name = name
        # Namespace of the symbol (with trailing "::"), e.g. "std::", or ""
        # for the global scope. None for C symbols.
        self.namespace = namespace
        # A list of corresponding headers.
        self.headers = headers


def _HasClass(tag, *classes):
    """Return True if `tag` has at least one of the CSS classes `classes`."""
    return any(c in classes for c in tag.get('class', []))


def _ParseSymbolPage(symbol_page_html, symbol_name):
    """Parse symbol page and retrieve the include header defined in this page.

    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified name to look for in the declaration rows.

    Returns a set of header strings (the text of the <code> elements): the
    headers preceding the declarations that name `symbol_name`, or, if the
    symbol is never named in any declaration row, every header on the page.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
        current_headers = []
        was_decl = False
        for row in table.select('tr'):
            if _HasClass(row, 't-dcl', 't-dsc'):
                was_decl = True
                # Symbols are in the first cell. A row without any <td>
                # cannot name the symbol, so skip it instead of crashing on
                # the None returned by find().
                first_cell = row.find('td')
                if first_cell is None:
                    continue
                if symbol_name not in first_cell.stripped_strings:
                    continue
                headers.update(current_headers)
            elif _HasClass(row, 't-dsc-header'):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                    was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseIndexPage(index_page_html):
    """Parse index page.

    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page,
    variant), where `variant` is truthy when the text node directly after the
    link contains a "(" caption.
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Ignore annotated symbols like "acos<>() (std::complex)".
        # These tend to be overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug
        # is unfixed.
        caption = symbol_href.next_sibling
        variant = isinstance(caption, NavigableString) and "(" in caption
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append((symbol_tt.text.rstrip("<>()"),  # strip trailing <>()
                            symbol_href["href"], variant))
    return symbols


def _ReadSymbolPage(path, name):
    """Read the symbol page at `path` and return the headers for `name`.

    Returns whatever _ParseSymbolPage returns for the file's contents.
    """
    # Decode explicitly as UTF-8 rather than with the locale's default
    # encoding; io.open accepts `encoding` on both Python 2 and Python 3.
    with io.open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name)


def _GetSymbols(pool, root_dir, index_page_name, namespace):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: a multiprocessing pool used to parse symbol pages in parallel.
      root_dir: directory containing the index page; symbol page hrefs are
        joined onto it.
      index_page_name: file name of the index page inside `root_dir`.
      namespace: namespace string stored on every returned Symbol.

    Returns a list of Symbols, sorted by name, each with a sorted header list.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with io.open(index_page_path, "r", encoding="utf-8") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(
            index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add
        # ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        if variant:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        results.append((symbol_name,
                        pool.apply_async(_ReadSymbolPage,
                                         (path, symbol_name))))

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers as well as the names so the result does not depend on
    # set iteration order (which varies between runs for strings).
    return [Symbol(name, namespace, sorted(headers))
            for name, headers in sorted(symbol_headers.items(),
                                        key=lambda t: t[0])]


def _IgnoreSigInt():
    """Pool worker initializer: make the worker ignore SIGINT (Ctrl-C)."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a list of Symbols, concatenated in the order of `parse_pages`.
    """
    symbols = []
    # Run many workers to process individual symbol pages under the symbol
    # index. Don't allow workers to capture Ctrl-C.
    # The initializer is a module-level function rather than a lambda because
    # it has to be picklable when workers are started with the "spawn" method.
    pool = multiprocessing.Pool(initializer=_IgnoreSigInt)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
    finally:
        pool.terminate()
        pool.join()
    return symbols