ذخیره نشدن فایل ها

سلام

وقت بخیر داخل ودیو این جلسه استاد دیتا هایی که به صورت دیکشنری بودن رو ذخیره کردن داخل دایرکتوری adv ولی من هرکاری کردم من جمله دیباگ کردن کد‌ها که فایل جیسونی برای من درست و ذخیره نشد

import json
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
from parser import AdvertisementParser
BACE_LINKS = "https://{}.craigslist.org/search/hhh?availabilityMode=0&lang=fr&cc=fr#search=1~gallery~0~2"
class CrawlerBace(ABC):
    @abstractmethod
    def start(self, store=False):
        pass
    @abstractmethod
    def store(self, data, filename=None):
        pass
    @staticmethod
    def get(link):
        try:
            response = requests.get(link)
        except requests.HTTPError:
            return None
        return response
class LinkCrawler(CrawlerBace):
    def __init__(self, cites):
        self.cites = cites
    def find_links(self, html_doc):
        soup = BeautifulSoup(html_doc, features='html.parser')
        return soup.find_all('a')
    def start_crawl(self, url):
        response = self.get(url)
        new_links = self.find_links(response.text)
        return new_links
    def start(self, store=False):
        adv_link = list()
        for city in self.cites:
            links = self.start_crawl(BACE_LINKS.format(city))
            print(f" city :{city} total: {len(links)}")
            adv_link.extend(links)
        if store:
            self.store([li.get("href") for li in adv_link])
            # links_text = [link.get_text() for link in links]
        return adv_link
    def store(self, data, filename=None):
        with open("fixtures/data.json", "w") as f:
            f.write(json.dumps(data))
class DataCrawle(CrawlerBace):
    def __init__(self):
        self.links = self.__load_link()
        self.parserr = AdvertisementParser()
    @staticmethod
    def __load_link():
        with open("fixtures/data.json", 'r') as f:
            links = json.loads(f.read())
            return links
    def start(self, store=False):
        for link in self.links:
            response = requests.get(link)
            data = self.parserr.parser(response.text)
            if store:
                self.store(data, data.get("post_id", "sample"))
    def store(self, data, filename):
        with open(f"fixtures/adv/{filename}.json", "w") as f:
            f.write(json.dumps(data))
        print(f"fixtures/adv/{filename}.json")
        # print(data)
        #,وقتی از این روش استور میکنم هیچ به غیر از یه فایل خالی به اسم(Anz.-ID,  Id publi) بهم میده
#---------------------------------------------------------------------------------------------------------------
    def store(self, data, filename):
        with open("fixtures/adv/test.json", "w") as f:
            f.write(json.dumps(data))
        print("fixtures/adv/test.json")
        # وقتی از این روش استور استفاده میکنم به من فقط یکی فایل جیسون میده
        # {
        #     "titel": "10 room, several air conditioned bedrooms,  summer rental August only",
        #     "prise": "\u20ac1.000",
        #     "post_id": "Anz.-ID: 7762557592",
        #     "creat_time": "2024-07-02T23:24:50+0200",
        #     "modified_time": null
        # }

from bs4 import BeautifulSoup
class AdvertisementParser:
    def __init__(self):
        self.soup = None
    @property
    def titel(self):
        titel_tag = self.soup.find('span', attrs={"id": "titletextonly"})
        if titel_tag:
            return titel_tag.text
    @property
    def price(self):
        price_tag = self.soup.find("span", attrs={"class": "price"})
        if price_tag:
            return price_tag.text
    @property
    def body(self):
        body_tag = self.soup.select_one("#postingbody")
        if body_tag:
            return body_tag.text
    @property
    def post_id(self):
        selector = "body > section > section > section > div.postinginfos > p:nth-child(1)"
        id_tag = self.soup.select_one(selector)
        if id_tag:
            return id_tag.text.replace('post id', '')
    @property
    def creat_time(self):
        selctor = "body > section > section > section > div.postinginfos > p.postinginfo.reveal > time"
        time = self.soup.select_one(selctor)
        if time:
            return time.attrs["datetime"]
    def parser(self, html_data):
        self.soup = BeautifulSoup(html_data, "html.parser")
        data = dict(
            titel=self.titel, prise=self.price, post_id=self.post_id,
            creat_time=self.creat_time, modified_time=None
        )
        return data

وقتی سوویچ find_link صدا میزنم لینک هایی که میگیره داخلش / و # هست که به صورت دستی پاک میکنم و بعدش سوویچ extract_pages صدا میزنم

import sys
from com import LinkCrawler, DataCrawle
# def get_pages_data():
    # raise NotImplemented()
if __name__ == "__main__":
    switch = sys.argv[1]
    if switch == "find_link":
        crawler = LinkCrawler(cites=['berlin', 'paris'])
        crawler.start(store = True)
    elif switch == "extract_pages":
        cr = DataCrawle()
        cr.start(store = True)

def store(self, data, filename): # ایجاد دایرکتوری در صورت عدم وجود directory = "fixtures/adv" if not os.path.exists(directory): os.makedirs(directory) # جایگزینی کاراکترهای غیرمجاز در نام فایل filename = "".join(c if c.isalnum() or c in (' ', '.', '_') else '_' for c in filename) filepath = os.path.join(directory, f"{filename}.json") with open(filepath, "w") as f: f.write(json.dumps(data, ensure_ascii=False, indent=4)) print(filepath)

import os import json from abc import ABC, abstractmethod import requests from bs4 import BeautifulSoup BACE_LINKS = "https://{}.craigslist.org/search/hhh?availabilityMode=0&lang=fr&cc=fr#search=1~gallery~0~2" class CrawlerBace(ABC): @abstractmethod def start(self, store=False): pass @abstractmethod def store(self, data, filename=None): pass @staticmethod def get(link): try: response = requests.get(link) except requests.HTTPError: return None return response class LinkCrawler(CrawlerBace): def __init__(self, cites): self.cites = cites def find_links(self, html_doc): soup = BeautifulSoup(html_doc, features='html.parser') return soup.find_all('a') def start_crawl(self, url): response = self.get(url) new_links = self.find_links(response.text) return new_links def start(self, store=False): adv_link = list() for city in self.cites: links = self.start_crawl(BACE_LINKS.format(city)) print(f"city: {city}, total: {len(links)}") adv_link.extend(links) if store: valid_links = [li.get("href") for li in adv_link if li.get("href") and li.get("href").startswith("http")] self.store(valid_links) return adv_link def store(self, data, filename=None): if not filename: filename = "fixtures/data.json" with open(filename, "w") as f: f.write(json.dumps(data)) class DataCrawle(CrawlerBace): def __init__(self): self.links = self.__load_link() self.parserr = AdvertisementParser() @staticmethod def __load_link(): with open("fixtures/data.json", 'r') as f: links = json.loads(f.read()) return links def start(self, store=False): try: for link in self.links: if not link.startswith("http"): print(f"Skipping invalid URL: {link}") continue response = requests.get(link) data = self.parserr.parser(response.text) print(data) # چاپ داده‌ها برای دیباگ if store: self.store(data, data.get("post_id", "sample")) except KeyboardInterrupt: print("Process interrupted by user. Exiting gracefully...") return def store(self, data, filename): # ایجاد دایرکتوری در صورت عدم وجود directory = "fixtures/adv" if not os.path.exists(directory): os.makedirs(directory) # جایگزینی کاراکترهای غیرمجاز در نام فایل filename = "".join(c if c.isalnum() or c in (' ', '.', '_') else '_' for c in filename) filepath = os.path.join(directory, f"{filename}.json") with open(filepath, "w") as f: f.write(json.dumps(data, ensure_ascii=False, indent=4)) print(filepath) class AdvertisementParser: def __init__(self): self.soup = None @property def titel(self): titel_tag = self.soup.find('span', attrs={"id": "titletextonly"}) if titel_tag: return titel_tag.text @property def price(self): price_tag = self.soup.find("span", attrs={"class": "price"}) if price_tag: return price_tag.text @property def body(self): body_tag = self.soup.select_one("#postingbody") if body_tag: return body_tag.text @property def post_id(self): selector = "body > section > section > section > div.postinginfos > p:nth-child(1)" id_tag = self.soup.select_one(selector) if id_tag: return id_tag.text.replace('post id', '').strip() @property def creat_time(self): selector = "body > section > section > section > div.postinginfos > p.postinginfo.reveal > time" time = self.soup.select_one(selector) if time: return time.attrs["datetime"] def parser(self, html_data): self.soup = BeautifulSoup(html_data, "html.parser") data = dict( titel=self.titel, price=self.price, post_id=self.post_id, creat_time=self.creat_time, modified_time=None ) return data

import sys from com import LinkCrawler, DataCrawle if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python main.py [find_link | extract_pages]") sys.exit(1) switch = sys.argv[1] if switch == "find_link": crawler = LinkCrawler(cites=['berlin', 'paris']) crawler.start(store=True) elif switch == "extract_pages": cr = DataCrawle() cr.start(store=True) else: print("Invalid option. Use 'find_link' or 'extract_pages'.") sys.exit(1)