Coverage for backend / app / job_email_scraping / job_scrapers / veganjobs.py: 22%

46 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""veganjobs.com job scraper module""" 

2 

3import re 

4 

5import cloudscraper 

6from bs4 import BeautifulSoup 

7 

8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

9 

10 

11class VeganJobsJobScraper: 

12 """Scraper for veganjobs.com job listings.""" 

13 

14 base_url = "https://veganjobs.com/job/" 

15 

16 def __init__(self, job_ids: str | list[str]) -> None: 

17 """Initialize the scraper with headers and delay settings. 

18 :param job_ids: The job ID(s)""" 

19 

20 self.scraper = cloudscraper.create_scraper() 

21 self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids 

22 self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids] 

23 

24 def scrape_job_listing(self, job_url: str) -> JobResult: 

25 """Scrape job data from a specific veganjobs.com job listing URL 

26 :param job_url: The URL of the job listing to scrape""" 

27 

28 response = self.scraper.get(job_url) 

29 response.raise_for_status() 

30 

31 soup = BeautifulSoup(response.content, "html.parser") 

32 

33 # Defaults 

34 company = None 

35 title = None 

36 location = None 

37 

38 # Title 

39 title_tag = soup.find("h2", class_="page-title") 

40 if title_tag: 

41 title = title_tag.get_text(strip=True) 

42 

43 # Company 

44 company_tag = soup.find("div", class_="joblisting-meta-company-name") 

45 if company_tag: 

46 company = company_tag.get_text(strip=True) 

47 

48 # Location 

49 location_tag = soup.find("li", class_="location") 

50 if location_tag: 

51 location = location_tag.get_text(strip=True) 

52 

53 # Full text block 

54 container = soup.find("div", class_="job_listing-description") 

55 # noinspection PyArgumentList 

56 text_content = container.get_text(separator="\n", strip=True) if container else "" 

57 

58 # Salary 

59 # salary_match = re.search(r"Salary:\s*(.+)", text_content) 

60 # salary_raw = salary_match.group(1).split("\n")[0] if salary_match else None 

61 

62 # Description (remove salary) 

63 description = re.sub(r"Salary:.*", "", text_content, flags=re.DOTALL).strip() 

64 description = description.strip("Overwiew").strip() 

65 

66 return JobResult( 

67 company=company, 

68 company_id=None, 

69 location=location, 

70 job=JobInfo( 

71 title=title, 

72 description=description, 

73 salary=Salary( 

74 min_amount=None, 

75 max_amount=None, 

76 currency=None, 

77 ), 

78 ), 

79 raw=soup.text, 

80 ) 

81 

82 def scrape_job(self) -> list[JobResult]: 

83 """Scrape a single job listing from the given URL.""" 

84 

85 job_data = [] 

86 for job_url in self.job_urls: 

87 for i in range(50): 

88 try: 

89 job_data.append(self.scrape_job_listing(job_url)) 

90 break 

91 except: 

92 pass 

93 else: 

94 raise AssertionError("Failed to scrape job listing after multiple attempts.") 

95 return job_data 

96 

97 

if __name__ == "__main__":
    # Manual smoke test: scrape one known listing and dump the result.
    job_slug = "sharpen-strategy-remote-usa-operations-coordinator"
    results = VeganJobsJobScraper(job_slug).scrape_job()
    print(results)