diff --git a/mirror/__init__.py b/mirror/__init__.py
index ae889c6..321cd3f 100644
--- a/mirror/__init__.py
+++ b/mirror/__init__.py
@@ -7,7 +7,7 @@
 __email__ = "engineering@bugout.dev"
 __license__ = "MIT"
 
-__version__ = "0.2.6"
+__version__ = "0.3.0"
 
 __all__ = (
     "__author__",
diff --git a/mirror/cli.py b/mirror/cli.py
index 45922d2..be4c486 100644
--- a/mirror/cli.py
+++ b/mirror/cli.py
@@ -10,6 +10,7 @@
 from .github.generate_snippets import generate_datasets
 from .github.sync import handler as sync_populator
 from .github.licenses import licenses_handler as licenses_populator
+from .data import stacktraces
 
 
 @click.group()
@@ -18,19 +19,46 @@ def mirror() -> None:
     pass
 
 
-@mirror.command()
+@mirror.command(
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    },
+)
 def version() -> None:
     click.echo(__version__)
 
 
-mirror.add_command(crawl_populator, name="crawl")
-mirror.add_command(nextid_populator, name="nextid")
-mirror.add_command(sample_populator, name="sample")
-mirror.add_command(validate_populator, name="validate")
-mirror.add_command(popular_repos, name="search")
-mirror.add_command(clone_repos, name="clone")
-mirror.add_command(generate_datasets, name="generate_snippets")
-mirror.add_command(commits, name="commits")
+@mirror.group(
+    "github",
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    },
+)
+def mirror_github() -> None:
+    pass
+
+
+mirror_github.add_command(crawl_populator, name="crawl")
+mirror_github.add_command(nextid_populator, name="nextid")
+mirror_github.add_command(sample_populator, name="sample")
+mirror_github.add_command(validate_populator, name="validate")
+mirror_github.add_command(popular_repos, name="search")
+mirror_github.add_command(clone_repos, name="clone")
+mirror_github.add_command(generate_datasets, name="generate_snippets")
+mirror_github.add_command(commits, name="commits")
+
+
+@mirror.group(
+    "data",
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    },
+)
+def mirror_data() -> None:
+    pass
+
+
+mirror_data.add_command(stacktraces.handler, "stacktraces")
 
 
 cli = click.CommandCollection(sources=[mirror])
diff --git a/mirror/data/__init__.py b/mirror/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mirror/data/stacktraces.py b/mirror/data/stacktraces.py
new file mode 100644
index 0000000..bd62120
--- /dev/null
+++ b/mirror/data/stacktraces.py
@@ -0,0 +1,188 @@
+"""
+Extracts stack traces from documents
+"""
+
+import json
+import sqlite3
+import sys
+from typing import Any, Callable, Dict, List, Optional
+
+import click
+from tqdm import tqdm
+
+
+def cpython_extractor(document: str) -> List[str]:
+    """
+    Extracts CPython stack traces from the given document.
+
+    A stack trace starts at a line containing "Traceback (most recent call last):" and ends at
+    the first following line that is blank, unindented, or indented exactly as far as the
+    "Traceback" line. That closing line is part of the extracted stack trace.
+
+    Returns the stack traces in document order, each with its lines joined by newlines.
+    """
+    document_lines = document.split("\n")
+    stacktraces: List[str] = []
+    current_stacktrace: List[str] = []
+    in_stacktrace: bool = False
+    indentation: str = ""
+    for line in document_lines:
+        stripped_line = line.lstrip()
+        if not in_stacktrace and "Traceback (most recent call last):" in stripped_line:
+            in_stacktrace = True
+            indentation = " " * (len(line) - len(stripped_line))
+            current_stacktrace.append(line)
+        elif in_stacktrace:
+            current_stacktrace.append(line)
+            if (
+                line == f"{indentation}{stripped_line}"
+                or line == stripped_line
+                or stripped_line.rstrip() == ""
+            ):
+                indentation = ""
+                in_stacktrace = False
+                # Lines were split on "\n", so rejoin them with it to keep the trace readable.
+                stacktraces.append("\n".join(current_stacktrace))
+                current_stacktrace = []
+
+    if current_stacktrace:
+        # The document ended while still inside a stack trace.
+        stacktraces.append("\n".join(current_stacktrace))
+
+    return stacktraces
+
+
+def ipython_extractor(document: str) -> List[str]:
+    """
+    Extracts IPython stack traces from the given document.
+
+    A stack trace starts at a line containing "Traceback (most recent call last)" and ends at
+    the first following non-empty line that begins with a letter and is either unindented or
+    indented exactly as far as the "Traceback" line. That closing line is part of the extracted
+    stack trace.
+
+    Returns the stack traces in document order, each with its lines joined by newlines.
+    """
+    document_lines = document.split("\n")
+    stacktraces: List[str] = []
+    current_stacktrace: List[str] = []
+    in_stacktrace: bool = False
+    indentation: str = ""
+    for line in document_lines:
+        stripped_line = line.lstrip()
+        # IPython tracebacks don't have a ":" at the end of the Traceback message
+        if not in_stacktrace and "Traceback (most recent call last)" in stripped_line:
+            in_stacktrace = True
+            indentation = " " * (len(line) - len(stripped_line))
+            current_stacktrace.append(line)
+        elif in_stacktrace:
+            current_stacktrace.append(line)
+            if len(stripped_line) > 0 and (
+                (line == f"{indentation}{stripped_line}" or line == stripped_line)
+                and stripped_line[0].isalpha()
+            ):
+                indentation = ""
+                in_stacktrace = False
+                # Lines were split on "\n", so rejoin them with it to keep the trace readable.
+                stacktraces.append("\n".join(current_stacktrace))
+                current_stacktrace = []
+
+    if current_stacktrace:
+        # The document ended while still inside a stack trace.
+        stacktraces.append("\n".join(current_stacktrace))
+
+    return stacktraces
+
+
+EXTRACTORS: Dict[str, Callable[[str], List[str]]] = {
+    "cpython": cpython_extractor,
+    "ipython": ipython_extractor,
+}
+
+
+@click.command(
+    context_settings={
+        "help_option_names": ["-h", "--help"],
+    }
+)
+@click.option(
+    "-r",
+    "--runtime",
+    type=click.Choice(list(EXTRACTORS), case_sensitive=False),
+    required=False,
+    default=None,
+    help="Runtime for which to extract stack traces from the given document",
+)
+@click.option(
+    "-l",
+    "--like",
+    type=str,
+    required=False,
+    default=None,
+    help='Filter to be used in LIKE clause on body column in SQL query (e.g. -l "%Traceback%")',
+)
+@click.argument("infile", type=click.Path(exists=True))
+def handler(
+    infile: str, like: Optional[str] = None, runtime: Optional[str] = None
+) -> None:
+    """
+    Accepts a path to a SQLite database containing GitHub issues and an optional filter over the
+    bodies of those issues.
+
+    Extracts the stack traces generated by the given runtime (or by every known runtime if none is
+    given) from the bodies of rows that match the given filter.
+
+    Result is written to standard output as a JSON object of the form:
+
+    \b
+    {
+        "like": "LIKE_FILTER",
+        "infile": "INFILE",
+        "data": [
+            {
+                "id": ISSUE_ID,
+                "html_url": "ISSUE_URL",
+                "stacktraces": {
+                    "RUNTIME": ["STACKTRACE", ...],
+                    ...
+                }
+            },
+            ...
+        ]
+    }
+    """
+    extractor_names: List[str] = list(EXTRACTORS)
+    if runtime is not None:
+        extractor_names = [runtime]
+
+    result: Dict[str, Any] = {"like": like, "infile": infile, "data": []}
+
+    query = "SELECT id, html_url, body FROM issues"
+    parameters: List[str] = []
+    if like is not None:
+        # Bind the filter as a query parameter rather than formatting it into the SQL text, so
+        # that quotes in the filter can neither break the query nor inject SQL into it.
+        query = f"{query} WHERE body LIKE ?"
+        parameters.append(like)
+
+    con = sqlite3.connect(infile)
+    try:
+        cursor = con.cursor()
+        rows = cursor.execute(query, parameters)
+        for issue_id, html_url, body in tqdm(rows):
+            stacktraces: Dict[str, List[str]] = {}
+            for extractor_name in extractor_names:
+                extractor = EXTRACTORS[extractor_name.lower()]
+                # sqlite3 returns NULL bodies as None; treat those as empty documents.
+                stacktraces[extractor_name] = extractor(body or "")
+            result["data"].append(
+                {
+                    "id": issue_id,
+                    "html_url": html_url,
+                    "stacktraces": stacktraces,
+                }
+            )
+    finally:
+        con.close()
+
+    json.dump(result, sys.stdout)
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..73cd8cc
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,4 @@
+[mypy]
+
+[mypy-tqdm.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index 4fe1256..b207cf5 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
     long_description = ifp.read()
 
 setup(
-    name=MODULE_NAME,
+    name="bugout-mirror",
     version=module.__version__,
     author=module.__author__,
     author_email=module.__email__,
@@ -45,8 +45,6 @@
         "requests",
         "tqdm",
     ],
-    extras_require={
-        "dev": ["black", "mypy", "jupyter"]
-    },
+    extras_require={"dev": ["black", "mypy", "jupyter"]},
     entry_points={"console_scripts": ["{0} = {0}.cli:cli".format(MODULE_NAME)]},
 )