Commit 5ee7498

Update - Add LazySources for Lazy Data Sources
1 parent 0267bc1 commit 5ee7498

File tree

9 files changed: +507 −1 lines changed

lazyops/lazyio/models.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -191,7 +191,6 @@ def readlines(self, *args, **kwargs):
     def get_num_lines(self):
         return sum(1 for _ in File.tflines(self._filename))
 
-    @timed_cache(10)
     @property
     def filesize(self):
         self._ensure_open()
```

lazyops/lazysources/README.md

Lines changed: 5 additions & 0 deletions

# LazySources

Lazy Data Sources

- `lazyops.lazysources.gdelt`: [GDELT](https://www.gdeltproject.org/)

lazyops/lazysources/__init__.py

Lines changed: 5 additions & 0 deletions

```python
from . import gdelt

__all__ = [
    'gdelt'
]
```

lazyops/lazysources/gdelt/README.md

Lines changed: 121 additions & 0 deletions

# LazySources - GDELT

Lazy Data Source for [GDELT](https://www.gdeltproject.org/)

Extended from [gdelt-doc-api](https://github.com/alex9smith/gdelt-doc-api).
Credit to the original authors @alex9smith and @FelixKleineBoesing

- Async Support
- Multiple Result Formats Available
    - Dict
    - Pandas DataFrame
    - JSON string
    - Object: GDELTArticle class
        - Can be called on to fully parse the URL
        - `article = articles[0]; article.parse()`


API client for the GDELT 2.0 Doc API. Supports async methods.

```python
from lazyops.lazysources.gdelt import GDELT, GDELTFilters

# Formats = [
#   'obj',  # GDELTArticle, which can be called to extract the url
#   'json', # Pure JSON string output
#   'dict', # Python dict
#   'pd',   # Pandas DataFrame
# ]

f = GDELTFilters(
    keyword = "climate change",
    start_date = "2021-05-10",
    end_date = "2021-05-15"
)
gd = GDELT(result_format='obj')

# Search for articles matching the filters
articles = gd.article_search(f)

# Or call the .search method directly
articles = gd.search(method='article', filters=f)

# Async methods
articles = await gd.async_search(method='article', filters=f)
articles = await gd.async_article_search(f)

# Parsing articles - synchronous
english_articles = [i for i in articles if i.language == 'English']

for article in english_articles:
    article.parse()
    print(article.text)

# Parsing articles - asynchronous
english_articles = [await article.async_parse() for article in english_articles]


# Get a timeline of the number of articles matching the filters
# timeline = gd.timeline_search("timelinevol", f)
```
### Article List
The article list mode of the API generates a list of news articles that match the filters.
The client returns this as a pandas DataFrame with columns `url`, `url_mobile`, `title`,
`seendate`, `socialimage`, `domain`, `language`, `sourcecountry`.

### Timeline Search
There are 5 available modes when making a timeline search:
* `timelinevol` - a timeline of the volume of news coverage matching the filters,
  represented as a percentage of the total news articles monitored by GDELT.
* `timelinevolraw` - similar to `timelinevol`, but returns the actual number of articles
  and a total rather than a percentage.
* `timelinelang` - similar to `timelinevol`, but breaks the total articles down by published language.
  Each language is returned as a separate column in the DataFrame.
* `timelinesourcecountry` - similar to `timelinevol`, but breaks the total articles down by the country
  they were published in. Each country is returned as a separate column in the DataFrame.
* `timelinetone` - a timeline of the average tone of the news coverage matching the filters.
  See [GDELT's documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/)
  for more information about the tone metric.

### Constructing filters for the GDELT API
Filters for `keyword`, `domain`, `domain_exact`, `country` and `theme`
can be passed either as a single string or as a list of strings. If a list is
passed, the values in the list are wrapped in a boolean OR.

Params
------
* `start_date`
    The start date for the filter in YYYY-MM-DD format. The API officially only supports the
    most recent 3 months of articles. Making a request for an earlier date range may still
    return data, but it's not guaranteed.
    Must provide either `start_date` and `end_date` or `timespan`.
* `end_date`
    The end date for the filter in YYYY-MM-DD format.
* `timespan`
    A timespan to search for, relative to the time of the request. Must match one of the API's timespan
    formats - https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
    Must provide either `start_date` and `end_date` or `timespan`.
* `num_records`
    The number of records to return. Only used in article list mode and can be up to 250.
* `keyword`
    Return articles containing the exact phrase `keyword` within the article text.
* `domain`
    Return articles from the specified domain. Does not require an exact match, so
    passing "cnn.com" will match articles from "cnn.com", "subdomain.cnn.com" and "notactuallycnn.com".
* `domain_exact`
    Similar to `domain`, but requires an exact match.
* `near`
    Return articles containing words close to each other in the text. Use `near()` to construct,
    e.g. `near = near(5, "airline", "climate")`.
* `repeat`
    Return articles containing a single word repeated at least a number of times. Use `repeat()`
    to construct, e.g. `repeat = repeat(3, "environment")`.
    If you want to construct a filter with multiple repeated words, construct with `multi_repeat()`
    instead, e.g. `repeat = multi_repeat([(2, "airline"), (3, "airport")], "AND")`.
* `country`
    Return articles published in a country, formatted as the FIPS 2-letter country code.
* `theme`
    Return articles that cover one of GDELT's GKG Themes. A full list of themes can be
    found here: http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT
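The `near()`, `repeat()` and `multi_repeat()` helpers described above map onto GDELT's text query syntax. A minimal stand-alone sketch — the exact clause format (`near{n}:"..."`, `repeat{n}:"..."`) follows the upstream gdelt-doc-api and is an assumption here, not a verbatim copy of this package's implementation:

```python
def near(n: int, *words: str) -> str:
    # Articles where the given words appear within `n` words of each other
    return f'near{n}:"{" ".join(words)}"'

def repeat(n: int, word: str) -> str:
    # Articles where `word` appears at least `n` times
    return f'repeat{n}:"{word}"'

def multi_repeat(repeats: list, how: str = "AND") -> str:
    # Combine several repeat clauses with a boolean operator ("AND" / "OR")
    return f" {how} ".join(repeat(n, w) for n, w in repeats)
```

For example, `near(5, "airline", "climate")` yields the clause `near5:"airline climate"`, which can be embedded in the query string sent to the Doc API.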

lazyops/lazysources/gdelt/__init__.py

Lines changed: 13 additions & 0 deletions

```python
from .core import GDELT
from .filters import GDELTFilters, near, repeat, multi_repeat
from .models import GDELTArticle, Article

__all__ = [
    'GDELT',
    'GDELTFilters',
    'near',
    'repeat',
    'multi_repeat',
    'GDELTArticle',
    'Article'
]
```

lazyops/lazysources/gdelt/_base.py

Lines changed: 17 additions & 0 deletions

```python
from lazyops import lazy_init, get_logger, timed_cache, LazyObject
lazy_init('pandas')

import pandas as pd

from enum import Enum
from typing import Dict, Optional, List, Union, Tuple

from dataclasses import dataclass
from lazyops.apis import LazySession, async_req
from lazyops.lazyio import LazyJson
from lazyops.lazyclasses import lazyclass


logger = get_logger('LazySources', 'GDELT')
```

lazyops/lazysources/gdelt/core.py

Lines changed: 141 additions & 0 deletions

```python
from ._base import *

from .filters import GDELTFilters
from .models import GDELTArticle

class GDELTFormat(Enum):
    dict = 'dict'
    obj = 'obj'
    json = 'json'
    pandas = 'pd'


class GDELTMethods(Enum):
    article = 'article'
    timeline = 'timeline'

class GDELT:
    api_url = 'https://api.gdeltproject.org/api/v2/doc/doc'
    available_modes = ["artlist", "timelinevol", "timelinevolraw", "timelinetone", "timelinelang", "timelinesourcecountry"]

    def __init__(self, result_format: Union[GDELTFormat, str] = GDELTFormat.obj, json_parsing_max_depth: int = 100, *args, **kwargs) -> None:
        self.max_depth_json_parsing = json_parsing_max_depth
        # Accept either a GDELTFormat member or its string value, e.g. 'obj'
        self._output_format = GDELTFormat(result_format) if isinstance(result_format, str) else result_format
        self.sess = LazySession()

    def return_article_result(self, articles: Dict = None):
        if not articles or not articles.get('articles'):
            return None
        if self._output_format.value == 'dict':
            return articles['articles']

        if self._output_format.value == 'pd':
            return pd.DataFrame(articles["articles"])

        if self._output_format.value == 'json':
            return LazyJson.dumps(articles['articles'])

        if self._output_format.value == 'obj':
            return [GDELTArticle(**article) for article in articles['articles']]

    def return_timeline_search(self, results: Dict = None):
        if not results:
            return None

        if self._output_format.value == 'dict':
            return results

        if self._output_format.value == 'pd':
            formatted = pd.DataFrame(results)
            formatted["datetime"] = pd.to_datetime(formatted["datetime"])
            return formatted

        if self._output_format.value == 'json':
            return LazyJson.dumps(results)

        if self._output_format.value == 'obj':
            return [LazyObject(res) for res in results]
```
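The per-format dispatch in `return_article_result` can be sketched stand-alone for the `dict` and `json` branches — a minimal sketch using the stdlib `json` module in place of `LazyJson`; `format_articles` is a hypothetical helper, not part of the package:

```python
import json

def format_articles(payload: dict, fmt: str = "dict"):
    """Dispatch a raw article payload to the requested output format."""
    articles = (payload or {}).get("articles")
    if not articles:
        return None           # empty or missing result set
    if fmt == "dict":
        return articles       # the raw list of article dicts
    if fmt == "json":
        return json.dumps(articles)
    raise ValueError(f"Unsupported format: {fmt}")
```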
```python
    def article_search(self, filters: GDELTFilters) -> Union[pd.DataFrame, Dict, str]:
        articles = self._query("artlist", filters.query_string)
        return self.return_article_result(articles)

    def timeline_search(self, mode: str, filters: GDELTFilters) -> Union[pd.DataFrame, Dict, str]:
        timeline = self._query(mode, filters.query_string)
        results = {"datetime": [entry["date"] for entry in timeline["timeline"][0]["data"]]}
        for series in timeline["timeline"]:
            results[series["series"]] = [entry["value"] for entry in series["data"]]

        if mode == "timelinevolraw":
            results["All Articles"] = [entry["norm"] for entry in timeline["timeline"][0]["data"]]
        return self.return_timeline_search(results)

    def search(self, method: Union[GDELTMethods, str], filters: GDELTFilters, mode: str = 'timelinevol') -> Union[pd.DataFrame, Dict, str]:
        # Accept either a GDELTMethods member or its string value, e.g. 'article'
        if isinstance(method, str): method = GDELTMethods(method)
        if method.value == 'article':
            return self.article_search(filters)
        if method.value == 'timeline':
            return self.timeline_search(mode, filters)

    async def async_search(self, method: Union[GDELTMethods, str], filters: GDELTFilters, mode: str = 'timelinevol') -> Union[pd.DataFrame, Dict, str]:
        if isinstance(method, str): method = GDELTMethods(method)
        if method.value == 'article':
            return await self.async_article_search(filters)
        if method.value == 'timeline':
            return await self.async_timeline_search(mode, filters)

    async def async_article_search(self, filters: GDELTFilters) -> Union[pd.DataFrame, Dict, str]:
        articles = await self._async_query("artlist", filters.query_string)
        return self.return_article_result(articles)

    async def async_timeline_search(self, mode: str, filters: GDELTFilters) -> Union[pd.DataFrame, Dict, str]:
        timeline = await self._async_query(mode, filters.query_string)
        results = {"datetime": [entry["date"] for entry in timeline["timeline"][0]["data"]]}
        for series in timeline["timeline"]:
            results[series["series"]] = [entry["value"] for entry in series["data"]]

        if mode == "timelinevolraw":
            results["All Articles"] = [entry["norm"] for entry in timeline["timeline"][0]["data"]]
        return self.return_timeline_search(results)
```
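The reshaping in `timeline_search` — one `datetime` column plus one column per returned series, with an extra raw-count column in `timelinevolraw` mode — can be exercised stand-alone against a minimal, hypothetical payload:

```python
def reshape_timeline(timeline: dict, mode: str = "timelinevol") -> dict:
    """Flatten a GDELT timeline payload into a column-oriented dict."""
    # All series share the same dates; take them from the first series
    results = {"datetime": [e["date"] for e in timeline["timeline"][0]["data"]]}
    for series in timeline["timeline"]:
        results[series["series"]] = [e["value"] for e in series["data"]]
    if mode == "timelinevolraw":
        # The raw mode also carries the total article count per interval
        results["All Articles"] = [e["norm"] for e in timeline["timeline"][0]["data"]]
    return results

# Hypothetical minimal payload in the shape the Doc API returns
payload = {"timeline": [{"series": "Volume Intensity", "data": [
    {"date": "20210510", "value": 0.1, "norm": 1000},
    {"date": "20210511", "value": 0.2, "norm": 1100},
]}]}
```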
```python
    def _decode_json(self, content, max_recursion_depth: int = 100, recursion_depth: int = 0):
        try:
            result = LazyJson.loads(content, recursive=True)
        except Exception as e:
            if recursion_depth >= max_recursion_depth:
                raise ValueError("Max recursion depth reached. JSON can't be parsed!")
            # Blank out the offending character and retry
            idx_to_replace = int(e.pos)
            if isinstance(content, bytes): content = content.decode("utf-8")
            json_message = list(content)
            json_message[idx_to_replace] = ' '
            new_message = ''.join(str(m) for m in json_message)
            return self._decode_json(content=new_message, max_recursion_depth=max_recursion_depth, recursion_depth=recursion_depth+1)
        return result

    async def _async_decode_json(self, content, max_recursion_depth: int = 100, recursion_depth: int = 0):
        try:
            result = LazyJson.loads(content, recursive=True)
        except Exception as e:
            if recursion_depth >= max_recursion_depth:
                raise ValueError("Max recursion depth reached. JSON can't be parsed!")
            idx_to_replace = int(e.pos)
            if isinstance(content, bytes): content = content.decode("utf-8")
            json_message = list(content)
            json_message[idx_to_replace] = ' '
            new_message = ''.join(str(m) for m in json_message)
            return await self._async_decode_json(content=new_message, max_recursion_depth=max_recursion_depth, recursion_depth=recursion_depth+1)
        return result

    def _query(self, mode: str, query_string: str) -> Dict:
        if mode not in GDELT.available_modes:
            raise ValueError(f"Mode {mode} not in supported API modes")
        resp = self.sess.fetch(url=GDELT.api_url, decode_json=False, method='GET', params={'query': query_string, 'mode': mode, 'format': 'json'})
        if resp.status_code not in [200, 202]:
            raise ValueError(f"The GDELT API returned a non-successful status code. Response message: {resp.text}")
        return self._decode_json(resp.content, max_recursion_depth=self.max_depth_json_parsing)

    async def _async_query(self, mode: str, query_string: str) -> Dict:
        if mode not in GDELT.available_modes:
            raise ValueError(f"Mode {mode} not in supported API modes")
        resp = await self.sess.async_fetch(url=GDELT.api_url, decode_json=False, method='GET', params={'query': query_string, 'mode': mode, 'format': 'json'})
        if resp.status_code not in [200, 202]:
            raise ValueError(f"The GDELT API returned a non-successful status code. Response message: {resp.text}")
        return await self._async_decode_json(resp.content, max_recursion_depth=self.max_depth_json_parsing)
```
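The retry loop in `_decode_json` blanks the character at the failure position and re-parses, up to a depth limit. A stand-alone sketch with the stdlib `json` module in place of `LazyJson` (same strategy; `decode_lenient` is a hypothetical name) behaves the same way:

```python
import json

def decode_lenient(content: str, max_depth: int = 100, depth: int = 0):
    """Parse JSON, blanking one offending character per retry."""
    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        if depth >= max_depth:
            raise ValueError("Max recursion depth reached. JSON can't be parsed!")
        chars = list(content)
        chars[e.pos] = ' '   # e.pos is the index where parsing failed
        return decode_lenient(''.join(chars), max_depth, depth + 1)
```

This trades strictness for robustness: a stray control character in an API response costs one extra parse pass instead of failing the whole request.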
