summaryrefslogtreecommitdiffstats
path: root/Doc/tools/suspicious.py
diff options
context:
space:
mode:
Diffstat (limited to 'Doc/tools/suspicious.py')
-rw-r--r--Doc/tools/suspicious.py274
1 files changed, 274 insertions, 0 deletions
diff --git a/Doc/tools/suspicious.py b/Doc/tools/suspicious.py
new file mode 100644
index 0000000..ee87733
--- /dev/null
+++ b/Doc/tools/suspicious.py
@@ -0,0 +1,274 @@
+"""
+Try to detect suspicious constructs, resembling markup
+that has leaked into the final output.
+
+Suspicious lines are reported in a comma-separated values (CSV) file,
+``suspicious.csv``, located in the output directory.
+
+The file is utf-8 encoded, and each line contains four fields:
+
+ * document name (normalized)
+ * line number in the source document
+ * problematic text
+ * complete line showing the problematic text in context
+
+It is common to find many false positives. To avoid reporting them
+again and again, they may be added to the ``susp-ignored.csv`` file
+(located in the same directory as this module). The file has the same
+format as ``suspicious.csv`` with a few differences:
+
+ - each line defines a rule; if the rule matches, the issue
+ is ignored.
+ - line number may be empty (that is, nothing between the
+ commas: ",,"). In this case, line numbers are ignored (the
+ rule matches anywhere in the file).
+ - the last field does not have to be a complete line; some
+ surrounding text (never more than a line) is enough for
+ context.
+
+Rules are processed sequentially. A rule matches when:
+
+ * document names are the same
+ * problematic texts are the same
+ * line numbers are close to each other (5 lines up or down), or the
+ rule has no line number
+ * the rule text is completely contained in the source line
+
+The simplest way to create the susp-ignored.csv file is by copying
+undesired entries from suspicious.csv (possibly trimming the last
+field).
+
+Copyright 2009 Gabriel A. Genellina
+
+"""
+
+import os
+import re
+import csv
+import sys
+
+from docutils import nodes
+from sphinx.builders import Builder
+
# Iterator factory over every suspicious fragment in a string: calling
# ``detect_all(text)`` yields one match object per fragment that looks like
# reST markup which leaked into the rendered output.
detect_all = re.compile(
    r'''
 ::(?=[^=])| # two :: (but NOT ::=)
 :[a-zA-Z][a-zA-Z0-9]+| # :foo
 `| # ` (seldom used by itself)
 (?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:)
 ''',
    re.UNICODE | re.VERBOSE,
).finditer

# True when running under Python 3; selects between the text-based (py3)
# and byte-based (py2) csv/encoding code paths.
py3 = sys.version_info[0] >= 3
+
+
class Rule:
    """A rule for ignoring issues.

    Attributes:
        docname: document to which this rule applies.
        lineno:  line number in the original source; the rule matches only
                 near that line.  ``None`` means "don't care".
        issue:   the markup fragment that triggered this rule.
        line:    text of the container element (single line only).
        used:    starts as False; flipped by the code that applies rules
                 once the rule has matched something.
    """

    def __init__(self, docname, lineno, issue, line):
        self.used = False
        self.line = line
        self.issue = issue
        self.lineno = lineno
        self.docname = docname

    def __repr__(self):
        # Rendered like a row of the ignore file with an empty
        # line-number field.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
+
+
+
class dialect(csv.excel):
    """CSV dialect used for the suspicious/ignored files.

    Same as ``csv.excel`` except that rows are terminated by a bare
    linefeed.
    """
    lineterminator = '\n'
+
+
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Each suspicious fragment that no ignore rule covers is appended to
    ``suspicious.csv`` in the output directory, reported as a warning, and
    sets the application's status code to 1.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every found document, so all of them get checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Always return an empty string."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and check every text node for leaked markup."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched anything."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first rule that matches is marked as
        used and makes this method return True.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The rule fragment only has to be contained in the document
            # line; an exact match of the whole line is not required.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue, emit a warning and flag the build as failed."""
        if not self.any_issue:
            # emit an empty info message before the first issue of a document
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        # Strip on both Python versions so the 120-character excerpt is not
        # wasted on leading whitespace.
        text = text.strip()
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            encoding = sys.getdefaultencoding()
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(encoding, 'replace'),
                lineno,
                issue.encode(encoding, 'replace'),
                text.encode(encoding, 'replace')))
        self.app.statuscode = 1

    def _open_csv(self, filename, mode):
        """Open *filename* the way the csv module needs on this Python.

        Python 3: text mode, UTF-8, no newline translation (so the
        dialect's linefeed terminator is written verbatim).
        Python 2: binary mode; callers encode/decode UTF-8 themselves.
        """
        if py3:
            return open(filename, mode, encoding='utf-8', newline='')
        return open(filename, mode + 'b')

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the log file."""
        row = [self.docname, lineno, issue, text.strip()]
        if not py3:
            # the Python 2 csv module works on byte strings
            row = [self.docname.encode('utf-8'),
                   lineno,
                   issue.encode('utf-8'),
                   text.strip().encode('utf-8')]
        # the context manager closes the file even if writing fails
        with self._open_csv(self.log_file_name, 'a') as f:
            csv.writer(f, dialect).writerow(row)

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened is not an error: the builder simply
        runs with no rules.  Raises ValueError if a row does not have
        exactly four fields.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = self._open_csv(filename, 'r')
        except IOError:
            # terminate the progress line started above
            self.info('none found')
            return
        # the context manager closes the file even when a row is malformed
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match anywhere"
                lineno = int(lineno) if lineno else None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
+
+
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain, starting at the node's parent, and returns
    the first ``line`` attribute that is not None.  Returns None when no
    ancestor carries a line number.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # ran off the top of the tree without finding a line number
            break
        lineno = node.line
    return lineno
+
+
+def extract_line(text, index):
+ """text may be a multiline string; extract
+ only the line containing the given character index.
+
+ >>> extract_line("abc\ndefgh\ni", 6)
+ >>> 'defgh'
+ >>> for i in (0, 2, 3, 4, 10):
+ ... print extract_line("abc\ndefgh\ni", i)
+ abc
+ abc
+ abc
+ defgh
+ defgh
+ i
+ """
+ p = text.rfind('\n', 0, index) + 1
+ q = text.find('\n', index)
+ if q < 0:
+ q = len(text)
+ return text[p:q]
+
+
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Node visitor that hands every suspicious fragment to the builder."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # only direct text containers are inspected
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        content = node.astext()
        # lineno seems to go backwards sometimes (?), so never let it
        # drop below the largest value seen so far
        lineno = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = lineno
        # don't report the same issue more than once per line
        reported = set()
        for found in detect_all(content):
            issue = found.group()
            line = extract_line(content, found.start())
            key = (issue, line)
            if key in reported:
                continue
            self.builder.check_issue(line, lineno, issue)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        # a new document starts: reset the running line number
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode