Coverage for backend / app / job_email_scraping / job_scrapers / veganjobs.py: 22%
46 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""veganjobs.com job scraper module"""
3import re
5import cloudscraper
6from bs4 import BeautifulSoup
8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
class VeganJobsJobScraper:
    """Scraper for veganjobs.com job listings."""

    base_url = "https://veganjobs.com/job/"

    def __init__(self, job_ids: str | list[str]) -> None:
        """Initialize the scraper session and build the listing URLs.

        :param job_ids: A single job ID (URL slug) or a list of job IDs.
        """
        self.scraper = cloudscraper.create_scraper()
        # Normalize to a list so scrape_job() can always iterate.
        self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids
        self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids]

    def scrape_job_listing(self, job_url: str) -> JobResult:
        """Scrape job data from a specific veganjobs.com job listing URL.

        :param job_url: The URL of the job listing to scrape.
        :return: A JobResult populated from the page; fields missing from
            the page are left as None.
        :raises HTTPError: If the page request returns an error status.
        """
        response = self.scraper.get(job_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Defaults when the corresponding element is absent from the page.
        company = None
        title = None
        location = None

        # Title
        title_tag = soup.find("h2", class_="page-title")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Company
        company_tag = soup.find("div", class_="joblisting-meta-company-name")
        if company_tag:
            company = company_tag.get_text(strip=True)

        # Location
        location_tag = soup.find("li", class_="location")
        if location_tag:
            location = location_tag.get_text(strip=True)

        # Full text of the listing body (empty string when the container is
        # missing, so the regexes below still operate safely).
        container = soup.find("div", class_="job_listing-description")
        # noinspection PyArgumentList
        text_content = container.get_text(separator="\n", strip=True) if container else ""

        # Description: drop everything from "Salary:" onward, then remove a
        # leading "Overview" heading if present.
        # BUG FIX: the original used description.strip("Overwiew"), which
        # (a) had a typo and (b) strips any of those *characters* from both
        # ends of the string — e.g. a description ending in "...driver" would
        # lose its trailing letters. removeprefix removes the exact heading
        # only, and only from the front.
        description = re.sub(r"Salary:.*", "", text_content, flags=re.DOTALL).strip()
        description = description.removeprefix("Overview").strip()

        return JobResult(
            company=company,
            company_id=None,
            location=location,
            job=JobInfo(
                title=title,
                description=description,
                # Salary is not parsed for this site; emit an empty Salary.
                salary=Salary(
                    min_amount=None,
                    max_amount=None,
                    currency=None,
                ),
            ),
            raw=soup.text,
        )

    def scrape_job(self) -> list[JobResult]:
        """Scrape every configured job listing, retrying each up to 50 times.

        :return: One JobResult per configured job ID.
        :raises AssertionError: If a listing still fails after all retries;
            the last underlying error is chained as the cause.
        """
        job_data = []
        for job_url in self.job_urls:
            last_error: Exception | None = None
            for _ in range(50):
                try:
                    job_data.append(self.scrape_job_listing(job_url))
                    break
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are no longer swallowed during retries.
                except Exception as exc:
                    last_error = exc
            else:
                raise AssertionError(
                    "Failed to scrape job listing after multiple attempts."
                ) from last_error
        return job_data
if __name__ == "__main__":
    # Manual smoke test: scrape a single known listing and dump the result.
    job_scraper = VeganJobsJobScraper("sharpen-strategy-remote-usa-operations-coordinator")
    results = job_scraper.scrape_job()
    print(results)