diff options
Diffstat (limited to 'Doc/tools/sphinxext/suspicious.py')
-rw-r--r-- | Doc/tools/sphinxext/suspicious.py | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/Doc/tools/sphinxext/suspicious.py b/Doc/tools/sphinxext/suspicious.py new file mode 100644 index 0000000..ae11793 --- /dev/null +++ b/Doc/tools/sphinxext/suspicious.py @@ -0,0 +1,237 @@ +""" +Try to detect suspicious constructs, resembling markup +that has leaked into the final output. + +Suspicious lines are reported in a comma-separated-file, +``suspicious.csv``, located in the output directory. + +The file is utf-8 encoded, and each line contains four fields: + + * document name (normalized) + * line number in the source document + * problematic text + * complete line showing the problematic text in context + +It is common to find many false positives. To avoid reporting them +again and again, they may be added to the ``ignored.csv`` file +(located in the configuration directory). The file has the same +format as ``suspicious.csv`` with a few differences: + + - each line defines a rule; if the rule matches, the issue + is ignored. + - line number may be empty (that is, nothing between the + commas: ",,"). In this case, line numbers are ignored (the + rule matches anywhere in the file). + - the last field does not have to be a complete line; some + surrounding text (never more than a line) is enough for + context. + +Rules are processed sequentially. A rule matches when: + + * document names are the same + * problematic texts are the same + * line numbers are close to each other (5 lines up or down) + * the rule text is completely contained into the source line + +The simplest way to create the ignored.csv file is by copying +undesired entries from suspicious.csv (possibly trimming the last +field.) + +Copyright 2009 Gabriel A. Genellina + +""" + +import os, sys +import csv +import re +from docutils import nodes +from sphinx.builders import Builder + +detect_all = re.compile(ur''' + ::(?=[^=])| # two :: (but NOT ::=) + :[a-zA-Z][a-zA-Z0-9]+| # :foo + `| # ` (seldom used by itself) + (?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:) + ''', re.UNICODE | re.VERBOSE).finditer + +class Rule: + def __init__(self, docname, lineno, issue, line): + "A rule for ignoring issues" + self.docname = docname # document to which this rule applies + self.lineno = lineno # line number in the original source; + # this rule matches only near that. + # None -> don't care + self.issue = issue # the markup fragment that triggered this rule + self.line = line # text of the container element (single line only) + + +class CheckSuspiciousMarkupBuilder(Builder): + """ + Checks for possibly invalid markup that may leak into the output + """ + name = 'suspicious' + + def init(self): + # create output file + self.log_file_name = os.path.join(self.outdir, 'suspicious.csv') + open(self.log_file_name, 'w').close() + # load database of previously ignored issues + self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv')) + + def get_outdated_docs(self): + return self.env.found_docs + + def get_target_uri(self, docname, typ=None): + return '' + + def prepare_writing(self, docnames): + ### PYTHON PROJECT SPECIFIC ### + for name in set(docnames): + if name.split('/', 1)[0] == 'documenting': + docnames.remove(name) + ### PYTHON PROJECT SPECIFIC ### + + def write_doc(self, docname, doctree): + self.any_issue = False # set when any issue is encountered in this document + self.docname = docname + visitor = SuspiciousVisitor(doctree, self) + doctree.walk(visitor) + + def finish(self): + return + + def check_issue(self, line, lineno, issue): + if not self.is_ignored(line, lineno, issue): + self.report_issue(line, lineno, issue) + + def is_ignored(self, line, lineno, issue): + """Determine whether this issue should be ignored. + """ + docname = self.docname + for rule in self.rules: + if rule.docname != docname: continue + if rule.issue != issue: continue + # Both lines must match *exactly*. This is rather strict, + # and probably should be improved. + # Doing fuzzy matches with levenshtein distance could work, + # but that means bringing other libraries... + # Ok, relax that requirement: just check if the rule fragment + # is contained in the document line + if rule.line not in line: continue + # Check both line numbers. If they're "near" + # this rule matches. (lineno=None means "don't care") + if (rule.lineno is not None) and \ + abs(rule.lineno - lineno) > 5: continue + # if it came this far, the rule matched + return True + return False + + def report_issue(self, text, lineno, issue): + if not self.any_issue: self.info() + self.any_issue = True + self.write_log_entry(lineno, issue, text) + self.warn('[%s:%d] "%s" found in "%-.120s"' % ( + self.docname.encode(sys.getdefaultencoding(),'replace'), + lineno, + issue.encode(sys.getdefaultencoding(),'replace'), + text.strip().encode(sys.getdefaultencoding(),'replace'))) + self.app.statuscode = 1 + + def write_log_entry(self, lineno, issue, text): + f = open(self.log_file_name, 'ab') + writer = csv.writer(f) + writer.writerow([self.docname.encode('utf-8'), + lineno, + issue.encode('utf-8'), + text.strip().encode('utf-8')]) + del writer + f.close() + + def load_rules(self, filename): + """Load database of previously ignored issues. + + A csv file, with exactly the same format as suspicious.csv + Fields: document name (normalized), line number, issue, surrounding text + """ + self.info("loading ignore rules... ", nonl=1) + self.rules = rules = [] + try: f = open(filename, 'rb') + except IOError: return + for i, row in enumerate(csv.reader(f)): + if len(row) != 4: + raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row) + docname, lineno, issue, text = row + docname = docname.decode('utf-8') + if lineno: lineno = int(lineno) + else: lineno = None + issue = issue.decode('utf-8') + text = text.decode('utf-8') + rule = Rule(docname, lineno, issue, text) + rules.append(rule) + f.close() + self.info('done, %d rules loaded' % len(self.rules)) + + +def get_lineno(node): + "Obtain line number information for a node" + lineno = None + while lineno is None and node: + node = node.parent + lineno = node.line + return lineno + + +def extract_line(text, index): + """text may be a multiline string; extract + only the line containing the given character index. + + >>> extract_line("abc\ndefgh\ni", 6) + >>> 'defgh' + >>> for i in (0, 2, 3, 4, 10): + ... print extract_line("abc\ndefgh\ni", i) + abc + abc + abc + defgh + defgh + i + """ + p = text.rfind('\n', 0, index) + 1 + q = text.find('\n', index) + if q<0: q = len(text) + return text[p:q] + + +class SuspiciousVisitor(nodes.GenericNodeVisitor): + + lastlineno = 0 + + def __init__(self, document, builder): + nodes.GenericNodeVisitor.__init__(self, document) + self.builder = builder + + def default_visit(self, node): + if isinstance(node, (nodes.Text, nodes.image)): # direct text containers + text = node.astext() + # lineno seems to go backwards sometimes (?) + self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno) + seen = set() # don't report the same issue more than only once per line + for match in detect_all(text): + #import pdb; pdb.set_trace() + issue = match.group() + line = extract_line(text, match.start()) + if (issue, line) not in seen: + self.builder.check_issue(line, lineno, issue) + seen.add((issue, line)) + + unknown_visit = default_visit + + def visit_document(self, node): + self.lastlineno = 0 + + def visit_comment(self, node): + # ignore comments -- too much false positives. + # (although doing this could miss some errors; + # there were two sections "commented-out" by mistake + # in the Python docs that would not be catched) + raise nodes.SkipNode |