Hi, good afternoon.
In the video for this session the instructor saves the data that comes back as dictionaries into the adv directory, but no matter what I tried, including debugging the code, no JSON file was created or saved for me. My code (the crawler module com.py, then parser.py, then the entry script) is below.
# com.py
import json
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup

from parser import AdvertisementParser

BACE_LINKS = "https://{}.craigslist.org/search/hhh?availabilityMode=0&lang=fr&cc=fr#search=1~gallery~0~2"


class CrawlerBace(ABC):
    """Abstract base class shared by the link crawler and the data crawler."""

    @abstractmethod
    def start(self, store=False):
        pass

    @abstractmethod
    def store(self, data, filename=None):
        pass

    @staticmethod
    def get(link):
        try:
            response = requests.get(link)
        except requests.HTTPError:
            return None
        return response


class LinkCrawler(CrawlerBace):
    """Collects advertisement links from the search page of each city."""

    def __init__(self, cites):
        self.cites = cites

    def find_links(self, html_doc):
        soup = BeautifulSoup(html_doc, features='html.parser')
        return soup.find_all('a')

    def start_crawl(self, url):
        response = self.get(url)
        new_links = self.find_links(response.text)
        return new_links

    def start(self, store=False):
        adv_link = list()
        for city in self.cites:
            links = self.start_crawl(BACE_LINKS.format(city))
            print(f"city: {city} total: {len(links)}")
            adv_link.extend(links)
        if store:
            self.store([li.get("href") for li in adv_link])
        # links_text = [link.get_text() for link in links]
        return adv_link

    def store(self, data, filename=None):
        with open("fixtures/data.json", "w") as f:
            f.write(json.dumps(data))


class DataCrawle(CrawlerBace):
    """Downloads each stored link and saves the parsed advertisement data."""

    def __init__(self):
        self.links = self.__load_link()
        self.parserr = AdvertisementParser()

    @staticmethod
    def __load_link():
        with open("fixtures/data.json", 'r') as f:
            links = json.loads(f.read())
        return links

    def start(self, store=False):
        for link in self.links:
            response = requests.get(link)
            data = self.parserr.parser(response.text)
            if store:
                self.store(data, data.get("post_id", "sample"))

    def store(self, data, filename):
        with open(f"fixtures/adv/{filename}.json", "w") as f:
            f.write(json.dumps(data))
        print(f"fixtures/adv/{filename}.json")
        # print(data)
    # When I store with the method above, all I get is one empty file named (Anz.-ID, Id publi)
    # ---------------------------------------------------------------------------------------------------------------
    def store(self, data, filename):
        with open("fixtures/adv/test.json", "w") as f:
            f.write(json.dumps(data))
        print("fixtures/adv/test.json")
    # When I use this store method instead, it only gives me a single JSON file:
# {
# "titel": "10 room, several air conditioned bedrooms, summer rental August only",
# "prise": "\u20ac1.000",
# "post_id": "Anz.-ID: 7762557592",
# "creat_time": "2024-07-02T23:24:50+0200",
# "modified_time": null
# }
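For context, two things can produce exactly these symptoms, and a defensive variant of the store step is sketched below. This is only my own guess at a fix, not the code from the video, and store_advertisement is a hypothetical helper name: open() never creates missing directories, so fixtures/adv has to exist before the first write, and the post id scraped from the page ("Anz.-ID: 7762557592" on the German locale, "Id publi" on the French one) contains a colon and a space, which Windows does not allow in ordinary file names and which can leave behind an empty-looking file called Anz.-ID.

import json
import os
import re


def store_advertisement(data, filename):
    """Write one advertisement dict to fixtures/adv/<safe filename>.json."""
    # open() does not create missing directories, so make fixtures/adv first
    os.makedirs("fixtures/adv", exist_ok=True)
    # drop characters that are unsafe in file names, e.g. the ":" and the space
    # inside "Anz.-ID: 7762557592"
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", str(filename)) or "sample"
    path = f"fixtures/adv/{safe_name}.json"
    with open(path, "w") as f:
        f.write(json.dumps(data))
    print(path)

DataCrawle.store could then call this helper (or copy its body) instead of opening f"fixtures/adv/{filename}.json" with the raw post id.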
# parser.py
from bs4 import BeautifulSoup


class AdvertisementParser:
    def __init__(self):
        self.soup = None

    @property
    def titel(self):
        titel_tag = self.soup.find('span', attrs={"id": "titletextonly"})
        if titel_tag:
            return titel_tag.text

    @property
    def price(self):
        price_tag = self.soup.find("span", attrs={"class": "price"})
        if price_tag:
            return price_tag.text

    @property
    def body(self):
        body_tag = self.soup.select_one("#postingbody")
        if body_tag:
            return body_tag.text

    @property
    def post_id(self):
        selector = "body > section > section > section > div.postinginfos > p:nth-child(1)"
        id_tag = self.soup.select_one(selector)
        if id_tag:
            return id_tag.text.replace('post id', '')

    @property
    def creat_time(self):
        selctor = "body > section > section > section > div.postinginfos > p.postinginfo.reveal > time"
        time = self.soup.select_one(selctor)
        if time:
            return time.attrs["datetime"]

    def parser(self, html_data):
        self.soup = BeautifulSoup(html_data, "html.parser")
        data = dict(
            titel=self.titel, prise=self.price, post_id=self.post_id,
            creat_time=self.creat_time, modified_time=None
        )
        return data
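To see what parser() actually hands to the crawler, here is a tiny standalone check of my own (the HTML is a made-up fragment, so the selector-based fields come back as None; on a real page they would be filled in):

from parser import AdvertisementParser

sample_html = """
<html><body>
  <span id="titletextonly">10 room summer rental</span>
  <span class="price">&euro;1.000</span>
</body></html>
"""

p = AdvertisementParser()
print(p.parser(sample_html))
# {'titel': '10 room summer rental', 'prise': '€1.000', 'post_id': None,
#  'creat_time': None, 'modified_time': None}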
When I run the find_link switch, some of the links it collects are just / and #, which I clean out by hand, and only then do I run the extract_pages switch.
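Instead of deleting those entries by hand, the hrefs could be filtered before they are stored. This is a minimal sketch of my own (clean_links is a hypothetical helper, not part of the course code), assuming real advertisement links are absolute URLs ending in .html, which is how craigslist post URLs currently look:

def clean_links(hrefs):
    # keep only absolute advertisement URLs, dropping "/", "#" and other
    # navigation fragments that soup.find_all('a') also returns
    return [h for h in hrefs if h and h.startswith("http") and h.endswith(".html")]

# e.g. inside LinkCrawler.start:
#     self.store(clean_links([li.get("href") for li in adv_link]))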
# the entry script (run from the command line with a switch argument)
import sys

from com import LinkCrawler, DataCrawle

# def get_pages_data():
#     raise NotImplemented()


if __name__ == "__main__":
    switch = sys.argv[1]
    if switch == "find_link":
        crawler = LinkCrawler(cites=['berlin', 'paris'])
        crawler.start(store=True)
    elif switch == "extract_pages":
        cr = DataCrawle()
        cr.start(store=True)
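For reference, assuming the entry script above is saved as main.py, the two switches described earlier would be run as "python main.py find_link" first and then "python main.py extract_pages".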