From 4a08993fac9832c6f4fdfded2cff094b53129328 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Wed, 19 May 2021 11:42:24 -0700 Subject: [PATCH 1/7] extract stacktraces from GH issues `mirror data stacktraces -h` for more info --- mirror/__init__.py | 2 +- mirror/cli.py | 42 +++++++--- mirror/data/__init__.py | 0 mirror/data/stacktraces.py | 157 +++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+), 10 deletions(-) create mode 100644 mirror/data/__init__.py create mode 100644 mirror/data/stacktraces.py diff --git a/mirror/__init__.py b/mirror/__init__.py index ae889c6..321cd3f 100644 --- a/mirror/__init__.py +++ b/mirror/__init__.py @@ -7,7 +7,7 @@ __email__ = "engineering@bugout.dev" __license__ = "MIT" -__version__ = "0.2.6" +__version__ = "0.3.0" __all__ = ( "__author__", diff --git a/mirror/cli.py b/mirror/cli.py index 45922d2..be4c486 100644 --- a/mirror/cli.py +++ b/mirror/cli.py @@ -10,6 +10,7 @@ from .github.generate_snippets import generate_datasets from .github.sync import handler as sync_populator from .github.licenses import licenses_handler as licenses_populator +from .data import stacktraces @click.group() @@ -18,19 +19,42 @@ def mirror() -> None: pass -@mirror.command() +@mirror.command( + context_settings={ + "help_option_names": ["-h", "--help"], + }, +) def version() -> None: click.echo(__version__) -mirror.add_command(crawl_populator, name="crawl") -mirror.add_command(nextid_populator, name="nextid") -mirror.add_command(sample_populator, name="sample") -mirror.add_command(validate_populator, name="validate") -mirror.add_command(popular_repos, name="search") -mirror.add_command(clone_repos, name="clone") -mirror.add_command(generate_datasets, name="generate_snippets") -mirror.add_command(commits, name="commits") +@mirror.group("github") +def mirror_github() -> None: + pass + + +mirror_github.add_command(crawl_populator, name="crawl") +mirror_github.add_command(nextid_populator, name="nextid") +mirror_github.add_command(sample_populator, name="sample") +mirror_github.add_command(validate_populator, name="validate") +mirror_github.add_command(popular_repos, name="search") +mirror_github.add_command(clone_repos, name="clone") +mirror_github.add_command(generate_datasets, name="generate_snippets") +mirror_github.add_command(commits, name="commits") + + +@mirror.group( + "data", + context_settings={ + "help_option_names": ["-h", "--help"], + }, +) +def mirror_data() -> None: + pass + + +mirror_data.add_command(stacktraces.handler, "stacktraces") + cli = click.CommandCollection(sources=[mirror]) diff --git a/mirror/data/__init__.py b/mirror/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mirror/data/stacktraces.py b/mirror/data/stacktraces.py new file mode 100644 index 0000000..511f26a --- /dev/null +++ b/mirror/data/stacktraces.py @@ -0,0 +1,157 @@ +""" +Extracts stack traces from documents +""" + +import json +import sqlite3 +import sys +from typing import Callable, Dict, List, Optional + +import click +from tqdm import tqdm + + +def cpython_extractor(document: str) -> List[str]: + document_lines = document.split("\n") + stacktraces: List[str] = [] + current_stacktrace: List[str] = [] + in_stacktrace: bool = False + indentation: str = "" + for line in document_lines: + stripped_line = line.lstrip() + if not in_stacktrace and "Traceback (most recent call last):" in stripped_line: + in_stacktrace = True + indentation = " " * (len(line) - len(stripped_line)) + current_stacktrace.append(line) + elif in_stacktrace: + current_stacktrace.append(line) + if ( + line == f"{indentation}{stripped_line}" + or line == stripped_line + or stripped_line.rstrip() == "" + ): + indentation = "" + in_stacktrace = False + stacktraces.append("".join(current_stacktrace)) + current_stacktrace = [] + + if current_stacktrace: + stacktraces.append("".join(current_stacktrace)) + + return stacktraces + + +def ipython_extractor(document: str) -> List[str]: + document_lines = document.split("\n") + stacktraces: List[str] = [] + current_stacktrace: List[str] = [] + in_stacktrace: bool = False + indentation: str = "" + for line in document_lines: + stripped_line = line.lstrip() + # IPython tracebacks don't have a ":" at the end of the Traceback message + if not in_stacktrace and "Traceback (most recent call last)" in stripped_line: + in_stacktrace = True + indentation = " " * (len(line) - len(stripped_line)) + current_stacktrace.append(line) + elif in_stacktrace: + current_stacktrace.append(line) + if len(stripped_line) > 0 and ( + (line == f"{indentation}{stripped_line}" or line == stripped_line) + and stripped_line[0].isalpha() + ): + indentation = "" + in_stacktrace = False + stacktraces.append("".join(current_stacktrace)) + current_stacktrace = [] + + if current_stacktrace: + stacktraces.append("".join(current_stacktrace)) + + return stacktraces + + +EXTRACTORS: Dict[str, Callable[[str], List[str]]] = { + "cpython": cpython_extractor, + "ipython": ipython_extractor, +} + + +@click.command( + context_settings={ + "help_option_names": ["-h", "--help"], + } +) +@click.option( + "-r", + "--runtime", + type=click.Choice(EXTRACTORS, case_sensitive=False), + required=False, + default=None, + help="Runtime for which to extract stack traces from the given document", +) +@click.option( + "-l", + "--like", + type=str, + required=False, + default=None, + help='Filter to be used in LIKE clause on body column in SQL query (e.g. -f "%Traceback%")', +) +@click.argument("infile", type=click.Path(exists=True)) +def handler( + infile: str, like: Optional[str] = None, runtime: Optional[str] = None +) -> None: + """ + Accepts a path to a SQLite database containing GitHub issues and an optional filter over the + bodies of those issues. + + Extracts the stack traces generated by the given runtime from the bodies of rows that match the + given filter. + + Result is JSON object of the form: + { + "like": "", + "infile": "", + "data": [ + { + "id": , + "html_url": "", + "stacktraces": { + "": [""], + ... + } + }, + ... + ] + } + """ + extractor_names: List[str] = list(EXTRACTORS) + if runtime is not None: + extractor_names = [runtime] + + result = {"like": like, "infile": infile, "data": []} + + con = sqlite3.connect(infile) + try: + cursor = con.cursor() + query = "SELECT id, html_url, body FROM issues" + if like is not None: + query = f"{query} WHERE body LIKE '{like}'" + rows = cursor.execute(query) + for id, html_url, body in tqdm(rows): + stacktraces: Dict[str, List[str]] = {} + for extractor_name in extractor_names: + extractor = EXTRACTORS[extractor_name.lower()] + stacktraces[extractor_name] = extractor(body) + result["data"].append( + { + "id": id, + "html_url": html_url, + "stacktraces": stacktraces, + } + ) + finally: + con.close() + + json.dump(result, sys.stdout) From 77b8529268b25b7b617eb05d2520f2d646803b84 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 07:44:59 -0700 Subject: [PATCH 2/7] Fixed mypy issues --- mirror/data/stacktraces.py | 4 ++-- mypy.ini | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 mypy.ini diff --git a/mirror/data/stacktraces.py b/mirror/data/stacktraces.py index 511f26a..2f2cea5 100644 --- a/mirror/data/stacktraces.py +++ b/mirror/data/stacktraces.py @@ -5,7 +5,7 @@ import json import sqlite3 import sys -from typing import Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional import click from tqdm import tqdm @@ -130,7 +130,7 @@ def handler( if runtime is not None: extractor_names = [runtime] - result = {"like": like, "infile": infile, "data": []} + result: Dict[str, Any] = {"like": like, "infile": infile, "data": []} con = sqlite3.connect(infile) try: diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..73cd8cc --- /dev/null +++ b/mypy.ini @@ -0,0 +1,4 @@ +[mypy] + +[mypy-tqdm.*] +ignore_missing_imports = True From 86286385402c0a06eb9bfe8238e77cc3c43be441 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 07:45:55 -0700 Subject: [PATCH 3/7] Changed module name to "bugout-mirror" --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 4fe1256..4bbe399 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pkg_resources import parse_requirements from setuptools import find_packages, setup -MODULE_NAME = "mirror" +MODULE_NAME = "bugout-mirror" module = SourceFileLoader( MODULE_NAME, os.path.join(MODULE_NAME, "__init__.py") @@ -45,8 +45,6 @@ "requests", "tqdm", ], - extras_require={ - "dev": ["black", "mypy", "jupyter"] - }, + extras_require={"dev": ["black", "mypy", "jupyter"]}, entry_points={"console_scripts": ["{0} = {0}.cli:cli".format(MODULE_NAME)]}, ) From 8af9bc03b72a2f3d80471822164be3c7a0ca5343 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 08:39:53 -0700 Subject: [PATCH 4/7] bugout-mirror -> bugout_mirror Because Python --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4bbe399..2d8061a 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pkg_resources import parse_requirements from setuptools import find_packages, setup -MODULE_NAME = "bugout-mirror" +MODULE_NAME = "bugout_mirror" module = SourceFileLoader( MODULE_NAME, os.path.join(MODULE_NAME, "__init__.py") From 1344772d534ec288927a87cf9f24806e0fba64b3 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 08:41:17 -0700 Subject: [PATCH 5/7] bugout_mirror -> mirror We'll figure this out later --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2d8061a..4fd8769 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from pkg_resources import parse_requirements from setuptools import find_packages, setup -MODULE_NAME = "bugout_mirror" +MODULE_NAME = "mirror" module = SourceFileLoader( MODULE_NAME, os.path.join(MODULE_NAME, "__init__.py") From 40bce53186639a8a2b4d87b1f56216317cbdf945 Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 08:41:49 -0700 Subject: [PATCH 6/7] MODULE_NAME is not package name --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4fd8769..b207cf5 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ long_description = ifp.read() setup( - name=MODULE_NAME, + name="bugout-mirror", version=module.__version__, author=module.__author__, author_email=module.__email__, From 0c27c532a61da07070bcfba3013ae13dcdcad73b Mon Sep 17 00:00:00 2001 From: Neeraj Kashyap Date: Thu, 20 May 2021 09:10:37 -0700 Subject: [PATCH 7/7] Fixed mypy error for click choices --- mirror/data/stacktraces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mirror/data/stacktraces.py b/mirror/data/stacktraces.py index 2f2cea5..bd62120 100644 --- a/mirror/data/stacktraces.py +++ b/mirror/data/stacktraces.py @@ -85,7 +85,7 @@ def ipython_extractor(document: str) -> List[str]: @click.option( "-r", "--runtime", - type=click.Choice(EXTRACTORS, case_sensitive=False), + type=click.Choice(list(EXTRACTORS), case_sensitive=False), required=False, default=None, help="Runtime for which to extract stack traces from the given document",