Spaces:
Running
Running
| from typing import List | |
| from haystack import component, Document | |
| from newspaper import Article | |
| import requests | |
class HackernewsFetcher():
    """Fetch the articles linked from the current Hacker News top stories.

    NOTE(review): this file imports ``component`` and ``List`` from Haystack
    but neither is used by this class. Presumably the class was meant to be
    registered as a Haystack component (``@component`` plus an output-types
    declaration on ``run``) -- confirm against the pipeline that uses it.
    """

    # Seconds to wait for each Hacker News API call. ``requests`` applies no
    # timeout unless one is passed, so a stalled connection would otherwise
    # block ``run`` indefinitely.
    REQUEST_TIMEOUT = 10

    def run(self, top_k: int):
        """Download and parse the pages linked by the first ``top_k`` top stories.

        Args:
            top_k: Number of story ids to take from the front of the
                top-stories list. Stories without a ``url`` field are
                skipped, so fewer than ``top_k`` documents may be returned.

        Returns:
            A dict with a single key ``'articles'`` holding a list of
            ``Document`` objects. Each document's content is the extracted
            article text and its meta carries ``'title'`` and ``'url'``.

        Raises:
            requests.RequestException: If a Hacker News API call fails,
                times out, or answers with an HTTP error status.
        """
        top_stories = requests.get(
            url='https://hacker-news.firebaseio.com/v0/topstories.json?print=pretty',
            timeout=self.REQUEST_TIMEOUT,
        )
        top_stories.raise_for_status()

        urls = []
        for story_id in top_stories.json()[0:top_k]:
            response = requests.get(
                url=f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json?print=pretty",
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            # Parse the body once. The API may answer with JSON null for an
            # item that is no longer available, hence the truthiness guard
            # before the membership test.
            item = response.json()
            if item and 'url' in item:
                urls.append(item['url'])

        docs = []
        for url in urls:
            # Best effort: one unreachable or unparsable page must not abort
            # the whole batch. Catch Exception rather than using a bare
            # except so KeyboardInterrupt / SystemExit still propagate.
            try:
                article = Article(url)
                article.download()
                article.parse()
                docs.append(Document(content=article.text, meta={'title': article.title, 'url': url}))
            except Exception:
                print(f"Couldn't download {url}, skipped")
        return {'articles': docs}