Coverage for backend/app/job_email_scraping/models.py: 99%
153 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""Job Email Scraper Database Models
3Defines SQLAlchemy ORM models for email-based job scraping functionality.
4Includes models for job alert emails, extracted job IDs, and scraped job data
5with associated companies and locations from external sources."""
from sqlalchemy import (
    Column,
    String,
    Boolean,
    ForeignKey,
    Integer,
    Float,
    TIMESTAMP,
    Table,
    UniqueConstraint,
    CheckConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB, ARRAY as PG_ARRAY
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import declared_attr, relationship
from sqlalchemy.sql import expression

from app.base_models import CommonBase, Owned
from app.database import Base
from app.service_runner.models import ServiceLog
# Association table for the many-to-many relationship between JobEmail and
# ScrapedJob (one alert email lists many jobs; the same job can appear in
# several emails). ON DELETE CASCADE removes mapping rows automatically when
# either side is deleted.
jobemail_scrapedjob_mapping = Table(
    "jobemail_scrapedjob_mapping",
    Base.metadata,
    Column("email_id", Integer, ForeignKey("job_email.id", ondelete="CASCADE"), primary_key=True),
    Column("job_id", Integer, ForeignKey("scraped_job.id", ondelete="CASCADE"), primary_key=True),
)
class JobEmail(Owned, Base):
    """Represents email messages containing job information like LinkedIn and Indeed job alerts

    Attributes:
    -----------
    - `external_email_id` (str): Unique identifier for the email message.
    - `subject` (str): Subject of the email message.
    - `sender` (str): Sender of the email message.
    - `date_received` (datetime): Date and time when the email was received.
    - `job_found_n` (int): Number of jobs found in the email.
    - `platform` (str): Platform from which the email was received (LinkedIn, Indeed, etc.).
    - `body` (str): Body of the email message.
    - `alert_name` (str, optional): Name of the job alert associated with the email.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the email.

    Relationships:
    --------------
    - `jobs` (list of ScrapedJob): List of scraped jobs associated with the email.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the email.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_email_id` and `owner_id` to ensure uniqueness per user."""

    # No column-level unique=True here: uniqueness is per owner, enforced by the
    # composite constraint in __table_args__ (mirrors ScrapedJob.external_job_id).
    # A column-level unique would wrongly forbid two different users from
    # receiving the same external email.
    external_email_id = Column(String, nullable=False)
    subject = Column(String, nullable=False)
    sender = Column(String, nullable=False)
    date_received = Column(TIMESTAMP(timezone=True), nullable=False)
    job_found_n = Column(Integer, nullable=False, default=0)
    platform = Column(String, nullable=False)
    body = Column(String, nullable=False)
    alert_name = Column(String, nullable=True)

    # Foreign keys
    # nullable=True is required for ondelete="SET NULL" to be satisfiable:
    # deleting the parent service log sets this column to NULL.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )

    # Relationships
    jobs = relationship("ScrapedJob", secondary=jobemail_scrapedjob_mapping, back_populates="emails")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="emails")

    __table_args__ = (UniqueConstraint("external_email_id", "owner_id", name="unique_email_per_owner"),)
class ScrapedJob(Owned, Base):
    """Represents scraped job postings from external sources with additional metadata.

    Attributes:
    -----------
    - `external_job_id` (str): Unique identifier for the job posting.
    - `platform` (str): Platform from which the job was scraped (LinkedIn, Indeed, etc.).
    - `is_processed` (bool): Indicates whether the job has been processed (scraped, skipped, copied, etc.).
    - `is_scraped` (bool): Indicates whether the job has been successfully scraped.
    - `is_failed` (bool): Indicates whether the job scraping failed.
    - `is_skipped` (bool): Indicates whether the job scraping was skipped (e.g., quota exceeded).
    - `scrape_error` (list of dict): List of error entries, each with 'datetime' and 'error' keys, recorded on each failed scrape attempt.
    - `skip_reason` (str, optional): Reason why the job scraping was skipped.
    - `scrape_datetime` (datetime, optional): Date and time when the job was scraped.
    - `is_active` (bool): Indicates whether the job is active.
    - `is_imported` (bool): Indicates whether the job was imported into a job.
    - `retry_count` (int): Number of times the job has been retried.
    - `next_retry_at` (datetime, optional): Date and time when the next retry is scheduled.

    # Job data
    - `title` (str, optional): Title of the job.
    - `description` (str, optional): Description of the job.
    - `salary_min` (float, optional): Minimum salary of the job.
    - `salary_max` (float, optional): Maximum salary of the job.
    - `salary_currency` (str, optional): Salary currency.
    - `url` (str, optional): URL to the job posting.
    - `raw_url` (str, optional): Raw URL to the job posting.
    - `deadline` (datetime, optional): Deadline for the job.
    - `company` (str, optional): Company name of the job.
    - `location` (str, optional): Raw location string of the job posting.
    - `location_postcode` (str, optional): Postcode of the job location.
    - `location_city` (str, optional): City of the job location.
    - `location_country` (str, optional): Country of the job location.
    - `parsed_location` (str, optional): Parsed location of the job posting.
    - `attendance_type` (str, optional): Attendance type of the job (e.g., remote, on-site).
    - `is_closed` (bool): Indicates whether the job is closed.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the job.
    - `exclusion_filter_id` (int, optional): Identifier for the ScrapingExclusionFilter that matched the job.
    - `favourite_filter_id` (int, optional): Identifier for the ScrapingFavouriteFilter that matched the job.
    - `geolocation_id` (int, optional): Identifier for the Geolocation associated with the job.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the job.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the job.
    - `job_rating` (JobRating): JobRating object associated with the job.
    - `exclusion_filter` (ScrapingExclusionFilter): Exclusion filter that matched the job.
    - `favourite_filter` (ScrapingFavouriteFilter): Favourite filter that matched the job.
    - `geolocation` (Geolocation): Geolocation associated with the job.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_job_id` and `owner_id` to ensure uniqueness per user."""

    external_job_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    is_processed = Column(Boolean, nullable=False, server_default=expression.false())
    is_scraped = Column(Boolean, nullable=False, server_default=expression.false())
    is_failed = Column(Boolean, nullable=False, server_default=expression.false())
    scrape_error = Column(JSONB, server_default="[]", nullable=False)
    is_skipped = Column(Boolean, nullable=False, server_default=expression.false())
    skip_reason = Column(String, nullable=True)
    scrape_datetime = Column(TIMESTAMP(timezone=True), nullable=True)
    is_active = Column(Boolean, nullable=False, server_default=expression.true())
    is_imported = Column(Boolean, nullable=False, server_default=expression.false())
    retry_count = Column(Integer, nullable=False, server_default="0")
    next_retry_at = Column(TIMESTAMP(timezone=True), nullable=True)

    # Job data
    title = Column(String, nullable=True)
    description = Column(String, nullable=True)
    salary_min = Column(Float, nullable=True)
    salary_max = Column(Float, nullable=True)
    salary_currency = Column(String, nullable=True)
    url = Column(String, nullable=True)
    raw_url = Column(String, nullable=True)
    deadline = Column(TIMESTAMP(timezone=True), nullable=True)
    company = Column(String, nullable=True)
    location = Column(String, nullable=True)
    location_postcode = Column(String, nullable=True)
    location_city = Column(String, nullable=True)
    location_country = Column(String, nullable=True)
    parsed_location = Column(String, nullable=True)
    attendance_type = Column(String, nullable=True)
    # server_default (not default=) so the DB enforces the default for rows
    # inserted outside the ORM, consistent with every other boolean flag above.
    is_closed = Column(Boolean, nullable=False, server_default=expression.false())

    # Foreign keys
    # nullable=True is required for ondelete="SET NULL" to be satisfiable:
    # deleting the parent service log sets this column to NULL.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )
    exclusion_filter_id = Column(
        Integer, ForeignKey("scraping_exclusion_filter.id", ondelete="SET NULL"), nullable=True
    )
    favourite_filter_id = Column(
        Integer, ForeignKey("scraping_favourite_filter.id", ondelete="SET NULL"), nullable=True
    )
    geolocation_id = Column(Integer, ForeignKey("geolocation.id", ondelete="SET NULL"), nullable=True)

    # Relationships
    emails = relationship("JobEmail", secondary=jobemail_scrapedjob_mapping, back_populates="jobs")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="scraped_jobs")
    job_rating = relationship("JobRating", back_populates="scraped_job", uselist=False)
    exclusion_filter = relationship("ScrapingExclusionFilter", back_populates="filtered_jobs")
    favourite_filter = relationship("ScrapingFavouriteFilter", back_populates="filtered_jobs")
    geolocation = relationship("Geolocation")

    # Constraints
    __table_args__ = (UniqueConstraint("external_job_id", "owner_id", name="unique_job_per_owner"),)
class JobEmailScrapingServiceLog(ServiceLog, CommonBase, Base):
    """Represents logs of service operations and their status.

    Attributes:
    -----------
    - `user_found_ids` (list of int): List of user IDs found during the service run.
    - `user_processed_ids` (list of int): List of user IDs processed during the service run.
    - `email_found_n` (int): Number of emails found during the service run.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the service log.
    - `scraped_jobs` (list of ScrapedJob): List of scraped jobs associated with the service log.
    - `platform_stats` (list of JobEmailScrapingPlatformStat): List of platform statistics associated with the service log.
    - `service_errors` (list of JobEmailScrapingServiceError): List of errors associated with the service log.

    Properties:
    -----------
    - `job_to_process_n` (int): Total jobs to scrape across all platforms.
    - `job_scrape_succeeded_n` (int): Total successfully scraped jobs across all platforms.
    - `job_scrape_failed_n` (int): Total failed scraped jobs across all platforms.
    - `job_scrape_copied_n` (int): Total copied scraped jobs found across all platforms.
    - `job_scrape_skipped_n` (int): Total skipped scraped jobs found across all platforms.
    - `job_scrape_filtered_n` (int): Total filtered scraped jobs across all platforms.
    - `job_found_n` (int): Total jobs found (copied + skipped) across all platforms.
    - `email_saved_n` (int): Total emails saved across all platforms.
    - `email_skipped_n` (int): Total emails skipped across all platforms."""

    # Users
    user_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    user_processed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Emails
    email_found_n = Column(Integer, nullable=False, default=0)

    # Relationships
    emails = relationship("JobEmail", back_populates="service_log")
    scraped_jobs = relationship("ScrapedJob", back_populates="service_log")
    platform_stats = relationship("JobEmailScrapingPlatformStat", back_populates="service_log")
    service_errors = relationship("JobEmailScrapingServiceError", back_populates="service_log")

    def __init__(self, **kwargs) -> None:
        """Initialise array fields with empty lists if not provided"""
        # server_default only applies at the DB level; in-memory instances need
        # real lists so the aggregation properties below work before a flush.
        kwargs.setdefault("user_found_ids", [])
        kwargs.setdefault("user_processed_ids", [])
        super().__init__(**kwargs)

    # The hybrid properties below aggregate the per-platform ID arrays stored
    # on the related JobEmailScrapingPlatformStat rows.

    @hybrid_property
    def job_to_process_n(self) -> int:
        """Total jobs to scrape across all platforms."""
        return sum(len(stat.job_to_process_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_succeeded_n(self) -> int:
        """Total successfully scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_succeeded_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_failed_n(self) -> int:
        """Total failed scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_failed_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_copied_n(self) -> int:
        """Total copied scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_copied_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_skipped_n(self) -> int:
        """Total skipped scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_found_n(self) -> int:
        """Total jobs copied/skipped across all platforms."""
        return sum(len(stat.job_found_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_saved_n(self) -> int:
        """Total emails saved across all platforms."""
        return sum(len(stat.email_saved_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_skipped_n(self) -> int:
        """Total emails skipped across all platforms."""
        return sum(len(stat.email_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_filtered_n(self) -> int:
        """Total filtered scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_filtered_ids) for stat in self.platform_stats)
class JobEmailScrapingPlatformStat(CommonBase, Base):
    """Per-platform stats for a service run linked to an JobEmailScrapingServiceLog.

    Attributes:
    -----------
    - `name` (str): Platform name (e.g. LinkedIn, Indeed).

    # Emails
    - `email_saved_ids` (list of int): List of saved email IDs.
    - `email_skipped_ids` (list of int): List of skipped email IDs.

    # Jobs
    - `job_found_ids` (list of int): List of found job IDs from the emails.
    - `job_to_process_ids` (list of int): List of job IDs to be processed.
    - `job_scrape_failed_ids` (list of int): List of failed job scrape IDs.
    - `job_scrape_succeeded_ids` (list of int): List of successful job scrape IDs.
    - `job_scrape_copied_ids` (list of int): List of copied job scrape IDs.
    - `job_scrape_skipped_ids` (list of int): List of skipped job scrape IDs.
    - `job_scrape_filtered_ids` (list of int): List of filtered job scrape IDs.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog.

    Constraints:
    ------------
    - Unique constraint on the combination of `service_log_id` and `name` to ensure uniqueness per platform per service log.
    """

    name = Column(String, nullable=False)

    # Emails
    email_saved_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    email_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Jobs
    job_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_to_process_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_failed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_succeeded_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_copied_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_filtered_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Foreign keys
    # CASCADE: per-platform stats are meaningless without their service log.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="platform_stats")

    # Constraints
    __table_args__ = (UniqueConstraint("service_log_id", "name", name="unique_platform_per_service_log"),)

    def __init__(self, **kwargs) -> None:
        """Initialise array fields with empty lists if not provided"""
        # server_default only applies at the DB level; in-memory instances need
        # real lists so the parent log's aggregation properties work pre-flush.
        kwargs.setdefault("email_saved_ids", [])
        kwargs.setdefault("email_skipped_ids", [])
        kwargs.setdefault("job_found_ids", [])
        kwargs.setdefault("job_to_process_ids", [])
        kwargs.setdefault("job_scrape_failed_ids", [])
        kwargs.setdefault("job_scrape_succeeded_ids", [])
        kwargs.setdefault("job_scrape_copied_ids", [])
        kwargs.setdefault("job_scrape_skipped_ids", [])
        kwargs.setdefault("job_scrape_filtered_ids", [])
        super().__init__(**kwargs)
class JobEmailScrapingServiceError(CommonBase, Base):
    """Records unexpected/unhandled errors raised during a service run.

    Attributes:
    -----------
    - `error_type` (str): Type of the error.
    - `message` (str): Error message.
    - `traceback` (str): Traceback of the error.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog"""

    error_type = Column(String, nullable=False)
    message = Column(String, nullable=False)
    traceback = Column(String, nullable=False)

    # Foreign keys
    # CASCADE: error records are meaningless without their service log.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="service_errors")
class Filter:
    """Mixin of user-defined rules to filter scraped jobs.

    Attributes:
    -----------
    - `type` (str): Type of filter (title, company, location, salary, attendance_type).
    - `operator` (str): Operator for the filter (contains, equals, starts_with, ends_with, less_than, greater_than).
    - `value` (str): Value to match against.
    - `case_sensitive` (bool): Whether string matching should be case-sensitive.
    - `is_active` (bool): Whether the filter is active.

    Constraints:
    ------------
    - Check constraint to ensure valid filter_type values.
    - Check constraint to ensure valid filter_operator values."""

    type = Column(String, nullable=False)
    operator = Column(String, nullable=False)
    value = Column(String, nullable=False)
    case_sensitive = Column(Boolean, nullable=False, server_default=expression.false())
    is_active = Column(Boolean, nullable=False, server_default=expression.true())

    # Constraints
    # Built via declared_attr so each concrete subclass (ScrapingExclusionFilter,
    # ScrapingFavouriteFilter) gets its own Constraint instances: a SQLAlchemy
    # Constraint object can only be attached to a single Table, so a plain
    # shared tuple on a mixin would attach the same objects to both tables.
    @declared_attr
    def __table_args__(cls):
        return (
            CheckConstraint(
                "type IN ('title', 'company', 'location', 'location_city', 'location_country', 'salary_min', 'salary_max', 'attendance_type')",
                name="valid_filter_type",
            ),
            CheckConstraint(
                "operator IN ('contains', 'equals', 'starts_with', 'ends_with', 'less_than', 'greater_than', 'not_contains', 'not_equals')",
                name="valid_filter_operator",
            ),
        )

    @hybrid_property
    def name(self) -> str:
        """Generate a human-readable name for the filter rule."""
        case_str = " (case-sensitive)" if self.case_sensitive else ""
        return f"{self.type} {self.operator} '{self.value}'{case_str}"
class ScrapingExclusionFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter out scraped jobs.

    Relationships:
    --------------
    - `filtered_jobs` (list of ScrapedJob): Jobs excluded by this filter."""

    filtered_jobs = relationship("ScrapedJob", back_populates="exclusion_filter")
class ScrapingFavouriteFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter favourite scraped jobs.

    Relationships:
    --------------
    - `filtered_jobs` (list of ScrapedJob): Jobs marked favourite by this filter."""

    filtered_jobs = relationship("ScrapedJob", back_populates="favourite_filter")
class ForwardingConfirmationLink(Owned, Base):
    """Represents a confirmation email sent to a user after setting up forwarding.

    Attributes:
    -----------
    - `email_external_id` (str): External ID of the email.
    - `url` (str): URL to the confirmation page.
    - `platform` (str): Platform (e.g., Gmail).
    - `is_used` (bool): Whether the confirmation link has been used."""

    email_external_id = Column(String, nullable=False)
    url = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    is_used = Column(Boolean, nullable=False, server_default=expression.false())