1
+ import spacy
1
2
import requests
2
3
import random
3
4
import logging
5
+ from concurrent .futures import ThreadPoolExecutor , as_completed
6
+ import nltk
7
+
8
# Load the spaCy English model once at import time; it is used by
# nlp_based_sentence for POS tagging of the sentence template.
nlp = spacy.load("en_core_web_lg")
# Download the NLTK "punkt" tokenizer data at import time.
# NOTE(review): punkt is not referenced anywhere in the visible code —
# confirm it is actually needed before keeping this download.
nltk.download("punkt")
4
11
5
12
# Configure logging
6
13
logging .basicConfig (
9
16
datefmt = "%Y-%m-%d %H:%M:%S"
10
17
)
11
18
12
- # API URLs for Datamuse
13
19
DATAMUSE_API = "https://api.datamuse.com/words"
14
20
STOP_WORDS = {"the" , "a" , "an" , "and" , "of" , "in" , "on" , "at" , "to" , "is" , "for" }
15
21
16
- def get_words_by_topic (topic_word , part_of_speech = None ):
17
- """Fetch words related to a given topic word using Datamuse API, optionally filtered by part of speech."""
18
- logging .info (f"Fetching related { part_of_speech or 'words' } for topic: { topic_word } " )
19
-
20
- params = {"ml" : topic_word }
21
- if part_of_speech :
22
- params ["sp" ] = f"*{ part_of_speech } "
23
-
22
def fetch_api_data(params, timeout=10):
    """Fetch JSON data from the Datamuse API.

    Args:
        params: Dict of query parameters passed to the Datamuse endpoint.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout, a stalled connection would hang the caller
            (and any thread-pool worker) indefinitely.

    Returns:
        The decoded JSON list on success, or an empty list on any failure
        (non-200 status, network error, or timeout) so callers can always
        iterate the result safely.
    """
    try:
        response = requests.get(DATAMUSE_API, params=params, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failures (DNS, refused connection, timeout) used to
        # propagate and crash thread-pool consumers; report and degrade instead.
        logging.error(f"API request failed: {e}")
        return []

    if response.status_code == 200:
        return response.json()
    logging.error(f"API request failed with status code {response.status_code}.")
    return []
34
30
35
31
def clean_topic_input (topic_sentence ):
@@ -40,13 +36,18 @@ def clean_topic_input(topic_sentence):
40
36
logging .debug (f"Filtered words: { filtered_words } " )
41
37
return filtered_words
42
38
43
- def determine_topics (words ):
39
+ def determine_topics (words , use_threads = False ):
44
40
"""Analyze the topics of filtered words using the Datamuse API."""
45
41
topics = set ()
46
42
47
- for word in words :
48
- # Make the API call to get related topics from Datamuse
49
- try :
43
+ if use_threads :
44
+ with ThreadPoolExecutor () as executor :
45
+ futures = [executor .submit (fetch_api_data , {"rel_trg" : word }) for word in words ]
46
+ for future in as_completed (futures ):
47
+ data = future .result ()
48
+ topics .update ([item ['word' ] for item in data ])
49
+ else :
50
+ for word in words :
50
51
logging .info (f"Fetching topics for word: { word } " )
51
52
response = requests .get (f"{ DATAMUSE_API } ?rel_trg={ word } " )
52
53
if response .status_code == 200 :
@@ -55,53 +56,101 @@ def determine_topics(words):
55
56
logging .debug (f"Related topics found for { word } : { [item ['word' ] for item in data ]} " )
56
57
else :
57
58
logging .warning (f"Datamuse API error for word: { word } . Status code: { response .status_code } " )
58
- except Exception as e :
59
- logging .error (f"Error fetching topics for { word } : { e } " )
60
59
61
- return list (topics ) # Return a list of unique topics
60
+ return list (topics )
62
61
63
- def create_sentence (topic ):
64
- """Form a sentence from related words with a specific structure: 2 nouns, 1 verb, 2 adjectives (optional), 1 adverb."""
65
-
66
- # Fetch at least 2 nouns, 1 verb, 2 adjectives (optional), and 1 adverb related to the topic
67
- nouns = get_words_by_topic (topic , 'n' ) or ['thing' ]
68
- verbs = get_words_by_topic (topic , 'v' ) or ['does' ]
69
- adjectives = get_words_by_topic (topic , 'adj' ) or []
70
- adverbs = get_words_by_topic (topic , 'adv' ) or ['' ]
71
-
72
- if len (nouns ) < 2 :
73
- logging .warning (f"Not enough nouns for { topic } , filling with defaults." )
74
- nouns .extend (['object' , 'entity' ])
62
def is_word_related_to_topic(word, topic, max_results=10):
    """Check whether *word* appears among the top Datamuse matches for *topic*.

    Args:
        word: Candidate word to test.
        topic: Topic word used for the "means-like" (ml) Datamuse query.
        max_results: How many top matches to consider (default 10, as before).

    Returns:
        True if *word* is among the top matches, False otherwise (including
        on any API or network failure).
    """
    logging.info(f"Checking if word '{word}' is related to topic '{topic}'")
    # Pass the query via params= so requests URL-encodes it; interpolating
    # the topic into the URL breaks for topics with spaces or '&', '#', etc.
    try:
        response = requests.get(
            DATAMUSE_API,
            params={"ml": topic, "max": max_results},
            timeout=10,
        )
    except requests.RequestException as e:
        logging.error(f"Relatedness check failed for '{word}': {e}")
        return False

    if response.status_code == 200:
        # Guard on 'word' key: treat malformed items as absent rather than raising.
        related_words = [item['word'] for item in response.json() if 'word' in item]
        return word in related_words
    return False
75
70
76
- if not verbs :
77
- logging . warning ( f"No verbs found for { topic } , filling with default." )
78
- verbs . append ( 'acts' )
71
def get_words_by_topic(topic_word, part_of_speech=None, max_words=10, use_threads=False):
    """Fetch up to *max_words* words related to *topic_word* from the Datamuse API.

    Args:
        topic_word: Seed word for the "means-like" (ml) query.
        part_of_speech: Optional suffix filter ('n', 'v', 'adj', 'adv')
            applied through Datamuse's "sp" wildcard pattern.
        max_words: Maximum number of results requested from the API.
        use_threads: Kept for backward compatibility with existing callers.
            Submitting a single job to a ThreadPoolExecutor and immediately
            waiting on its result is equivalent to a direct call, so both
            modes now share the same code path.

    Returns:
        A list of related word strings; empty on any API failure
        (fetch_api_data logs the failure).
    """
    logging.info(f"Fetching related {part_of_speech or 'words'} for topic: {topic_word}")

    params = {"ml": topic_word, "max": max_words}
    if part_of_speech:
        params["sp"] = f"*{part_of_speech}"

    # Delegate the HTTP call to the shared helper instead of duplicating the
    # request/status-check logic here.
    data = fetch_api_data(params)
    return [item['word'] for item in data if 'word' in item]
92
91
93
- def generate_paragraphs (topics , num_paragraphs = 5 ):
92
def nlp_based_sentence(template_sentence, topic):
    """Build a sentence about *topic* by replacing POS slots in a template.

    spaCy tags each token of *template_sentence*; tokens whose POS is one of
    NOUN/VERB/ADJ/ADV are replaced sequentially with topic-related words
    fetched from the Datamuse API. When the API yields no (more) words for a
    POS, generic fallback words are used; when those run out too, the original
    token text is kept.

    Args:
        template_sentence: Sentence whose tagged tokens act as slots.
        topic: Topic word used to fetch the replacement vocabulary.

    Returns:
        The generated sentence, capitalized.
    """
    doc = nlp(template_sentence)
    logging.debug(f"Original Sentence Structure: {[token.text for token in doc]}")

    # POS tag -> candidate replacement words fetched from Datamuse.
    pos_map = {
        "NOUN": get_words_by_topic(topic, 'n', max_words=10),
        "VERB": get_words_by_topic(topic, 'v', max_words=5),
        "ADJ": get_words_by_topic(topic, 'adj', max_words=5),
        "ADV": get_words_by_topic(topic, 'adv', max_words=5)
    }

    # Generic fallback words, consumed once the API-provided list for a POS
    # is empty or exhausted. (Previously defined but never used — a bug:
    # empty API results left raw template tokens in the output.)
    pos_fallback = {
        "NOUN": ["thing", "object", "item"],
        "VERB": ["does", "is"],
        "ADJ": ["nice", "good"],
        "ADV": ["quickly"]
    }

    # Construct the sentence by replacing each recognized part of speech.
    generated_sentence = []
    for token in doc:
        if token.pos_ in pos_map:
            # Prefer topic words; fall back to the generic list when empty.
            candidates = pos_map[token.pos_] or pos_fallback[token.pos_]
            if candidates:
                generated_sentence.append(candidates.pop(0))  # sequential selection
                continue
        generated_sentence.append(token.text)  # keep original if no word is available

    final_sentence = " ".join(generated_sentence)
    logging.debug(f"Generated NLP Sentence: {final_sentence}")
    return final_sentence.capitalize()
128
+
129
def create_sentence(topic, use_threads=False):
    """Produce one NLP-polished sentence about *topic*.

    A fixed POS template (adjective, two nouns, a verb, and an adverb) is
    handed to nlp_based_sentence, which fills the slots with words related
    to the topic.

    Args:
        topic: Topic word driving the vocabulary selection.
        use_threads: Accepted for interface compatibility with callers;
            not currently forwarded anywhere.

    Returns:
        The generated sentence string.
    """
    # Placeholder sentence whose tokens get rewritten per part of speech.
    template_sentence = "The [ADJ] [NOUN] [VERB] the [NOUN] [ADV]."
    return nlp_based_sentence(template_sentence, topic)
139
+
140
def generate_paragraphs(topics, num_paragraphs=5, use_threads=False,
                        sentences_per_paragraph=2):
    """Generate paragraphs of topic-related sentences.

    Args:
        topics: Non-empty list of topic words to draw from.
        num_paragraphs: Number of paragraphs to generate.
        use_threads: Forwarded to create_sentence for threaded API calls.
        sentences_per_paragraph: Sentences per paragraph (default 2, matching
            the previous hard-coded value).

    Returns:
        A list of paragraph strings, one per generated paragraph.
    """
    paragraphs = []
    logging.info(f"Generating {num_paragraphs} paragraphs.")

    for _ in range(num_paragraphs):
        # random.choice is the idiomatic equivalent of indexing with
        # random.randint(0, len(topics) - 1).
        selected_topic = random.choice(topics)
        logging.info(f"Generating sentences for topic: {selected_topic}")

        sentences = [
            create_sentence(selected_topic, use_threads=use_threads)
            for _ in range(sentences_per_paragraph)
        ]
        paragraphs.append(" ".join(sentences))

    return paragraphs
106
155
107
156
def main ():
@@ -122,14 +171,16 @@ def main():
122
171
\____ )MMMMMM| .'
123
172
`-' `--' hjm
124
173
""" )
174
+
125
175
# Get user input for the topic sentence
126
176
user_input = input ("Enter a topic: " )
127
177
128
178
# Step 1: Clean the input by removing stop words (articles, conjunctions, etc.)
129
179
filtered_words = clean_topic_input (user_input )
130
180
131
181
# Step 2: Determine topics from the filtered words
132
- topics = determine_topics (filtered_words )
182
+ use_threads = input ("Do you want to use threads to speed up (yes/no)? " ).lower () == "yes"
183
+ topics = determine_topics (filtered_words , use_threads = use_threads )
133
184
134
185
if not topics :
135
186
logging .warning ("No valid topics found. Please enter a valid input." )
@@ -138,7 +189,7 @@ def main():
138
189
logging .info (f"Identified topics: { topics } " )
139
190
140
191
# Step 3: Generate paragraphs based on the topics
141
- paragraphs = generate_paragraphs (topics )
192
+ paragraphs = generate_paragraphs (topics , use_threads = use_threads )
142
193
143
194
# Step 4: Output the generated paragraphs
144
195
logging .info ("Generated paragraphs successfully." )
0 commit comments