bugout-dev · zomglings · May 19, 2021 · May 20, 2021 · May 20, 2021 · May 20, 2021
diff --git a/mirror/__init__.py b/mirror/__init__.py
@@ -7,7 +7,7 @@
 
 __email__ = "engineering@bugout.dev"
 __license__ = "MIT"
-__version__ = "0.2.6"
+__version__ = "0.3.0"
 
 __all__ = (
     "__author__",

diff --git a/mirror/cli.py b/mirror/cli.py
@@ -10,6 +10,7 @@
 from .github.generate_snippets import generate_datasets
 from .github.sync import handler as sync_populator
 from .github.licenses import licenses_handler as licenses_populator
+from .data import stacktraces
 
 
 @click.group()
@@ -18,19 +19,42 @@ def mirror() -> None:
     pass
 
 
-@mirror.command()
+@mirror.command(
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    },
+)
 def version() -> None:
     click.echo(__version__)
 
 
-mirror.add_command(crawl_populator, name="crawl")
-mirror.add_command(nextid_populator, name="nextid")
-mirror.add_command(sample_populator, name="sample")
-mirror.add_command(validate_populator, name="validate")
-mirror.add_command(popular_repos, name="search")
-mirror.add_command(clone_repos, name="clone")
-mirror.add_command(generate_datasets, name="generate_snippets")
-mirror.add_command(commits, name="commits")
+@mirror.group("github", context_settings={"help_option_names": ["-h", "--help"]})
+def mirror_github() -> None:
+    """Commands for working with GitHub: crawl, search, clone, commits, and more."""
+
+
+mirror_github.add_command(crawl_populator, name="crawl")
+mirror_github.add_command(nextid_populator, name="nextid")
+mirror_github.add_command(sample_populator, name="sample")
+mirror_github.add_command(validate_populator, name="validate")
+mirror_github.add_command(popular_repos, name="search")
+mirror_github.add_command(clone_repos, name="clone")
+mirror_github.add_command(generate_datasets, name="generate_snippets")
+mirror_github.add_command(commits, name="commits")
+
+
+@mirror.group(
+    "data",
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    },
+)
+def mirror_data() -> None:
+    """Commands that extract data (currently stack traces) from documents."""
+
+
+mirror_data.add_command(stacktraces.handler, name="stacktraces")
+
 
 cli = click.CommandCollection(sources=[mirror])
 

diff --git a/mirror/data/__init__.py b/mirror/data/__init__.py
diff --git a/mirror/data/stacktraces.py b/mirror/data/stacktraces.py
@@ -0,0 +1,157 @@
+"""
+Extracts stack traces from documents
+"""
+
+import json
+import sqlite3
+import sys
+from typing import Any, Callable, Dict, List, Optional
+
+import click
+from tqdm import tqdm
+
+
+def cpython_extractor(document: str) -> List[str]:
+    """
+    Extracts CPython stack traces from the given document.
+
+    A stack trace starts at a line containing "Traceback (most recent call last):" and ends at
+    the first following line that is blank, unindented, or indented exactly like that first
+    line (typically the exception message). Returns the stack traces in document order, each
+    with its lines joined by newlines.
+    """
+    stacktraces: List[str] = []
+    current_stacktrace: List[str] = []
+    indentation: Optional[str] = None  # None whenever we are outside a stack trace
+    for line in document.split("\n"):
+        stripped_line = line.lstrip()
+        # Actual leading whitespace (not a count of spaces) so tabs compare correctly.
+        line_indentation = line[: len(line) - len(stripped_line)]
+        if indentation is None:
+            if "Traceback (most recent call last):" in stripped_line:
+                indentation = line_indentation
+                current_stacktrace.append(line)
+            continue
+        current_stacktrace.append(line)
+        if not stripped_line or line_indentation in ("", indentation):
+            stacktraces.append("\n".join(current_stacktrace))
+            current_stacktrace, indentation = [], None
+    if current_stacktrace:
+        stacktraces.append("\n".join(current_stacktrace))
+    return stacktraces
+
+
+def ipython_extractor(document: str) -> List[str]:
+    """
+    Extracts IPython stack traces from the given document.
+
+    A stack trace starts at a line containing "Traceback (most recent call last)" (IPython omits
+    the trailing ":") and ends at the first following line that starts with a letter and is
+    either unindented or indented exactly like that first line (typically the exception
+    message). Returns the stack traces in document order, each with its lines joined by newlines.
+    """
+    stacktraces: List[str] = []
+    current_stacktrace: List[str] = []
+    indentation: Optional[str] = None  # None whenever we are outside a stack trace
+    for line in document.split("\n"):
+        stripped_line = line.lstrip()
+        # Actual leading whitespace (not a count of spaces) so tabs compare correctly.
+        line_indentation = line[: len(line) - len(stripped_line)]
+        if indentation is None:
+            if "Traceback (most recent call last)" in stripped_line:
+                indentation = line_indentation
+                current_stacktrace.append(line)
+            continue
+        current_stacktrace.append(line)
+        if stripped_line[:1].isalpha() and line_indentation in ("", indentation):
+            stacktraces.append("\n".join(current_stacktrace))
+            current_stacktrace, indentation = [], None
+    if current_stacktrace:
+        stacktraces.append("\n".join(current_stacktrace))
+    return stacktraces
+
+
+EXTRACTORS: Dict[str, Callable[[str], List[str]]] = {  # keyed by runtime name
+    "cpython": cpython_extractor,
+    "ipython": ipython_extractor,
+}
+
+
+@click.command(
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    }
+)
+@click.option(
+    "-r",
+    "--runtime",
+    type=click.Choice(list(EXTRACTORS), case_sensitive=False),
+    required=False,
+    default=None,
+    help="Runtime for which to extract stack traces from the given document",
+)
+@click.option(
+    "-l",
+    "--like",
+    type=str,
+    required=False,
+    default=None,
+    help='Filter to be used in LIKE clause on body column in SQL query (e.g. -l "%Traceback%")',
+)
+@click.argument("infile", type=click.Path(exists=True))
+def handler(
+    infile: str, like: Optional[str] = None, runtime: Optional[str] = None
+) -> None:
+    """
+    Accepts a path to a SQLite database containing GitHub issues and an optional filter over the
+    bodies of those issues.
+
+    Extracts the stack traces generated by the given runtime (every supported runtime if none is
+    given) from the bodies of rows that match the given filter (every row if none is given).
+
+    Result is a JSON object, written to standard output, of the form:
+        {
+            "like": "<filter>",
+            "infile": "<infile>",
+            "data": [
+                {
+                    "id": <id of row corresponding to the issue>,
+                    "html_url": "<URL at which the issue can be viewed>",
+                    "stacktraces": {
+                        "<extractor_name>": ["<list of stacktraces extracted from the issue by the given extractor>"],
+                        ...
+                    }
+                },
+                ...
+            ]
+        }
+    """
+    extractor_names: List[str] = list(EXTRACTORS) if runtime is None else [runtime]
+    result: Dict[str, Any] = {"like": like, "infile": infile, "data": []}
+
+    query = "SELECT id, html_url, body FROM issues"
+    parameters: List[str] = []
+    if like is not None:
+        # Bind the filter as a parameter instead of interpolating it, so that quotes in
+        # the filter can neither break the statement nor inject SQL.
+        query = f"{query} WHERE body LIKE ?"
+        parameters.append(like)
+
+    con = sqlite3.connect(infile)
+    try:
+        cursor = con.cursor()
+        rows = cursor.execute(query, parameters)
+        for issue_id, html_url, body in tqdm(rows):
+            # A NULL body has no text to search; treat it as an empty document.
+            document = body if body is not None else ""
+            stacktraces: Dict[str, List[str]] = {
+                extractor_name: EXTRACTORS[extractor_name.lower()](document)
+                for extractor_name in extractor_names
+            }
+            result["data"].append(
+                {"id": issue_id, "html_url": html_url, "stacktraces": stacktraces}
+            )
+    finally:
+        con.close()
+
+    json.dump(result, sys.stdout)
diff --git a/mypy.ini b/mypy.ini
@@ -0,0 +1,4 @@
+[mypy]
+
+[mypy-tqdm.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
     long_description = ifp.read()
 
 setup(
-    name=MODULE_NAME,
+    name="bugout-mirror",
     version=module.__version__,
     author=module.__author__,
     author_email=module.__email__,
@@ -45,8 +45,6 @@
         "requests",
         "tqdm",
     ],
-    extras_require={
-        "dev": ["black", "mypy", "jupyter"]
-    },
+    extras_require={"dev": ["black", "mypy", "jupyter"]},
     entry_points={"console_scripts": ["{0} = {0}.cli:cli".format(MODULE_NAME)]},
 )