Coverage for backend / app / job_email_scraping / models.py: 99%

153 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""Job Email Scraper Database Models 

2 

3Defines SQLAlchemy ORM models for email-based job scraping functionality. 

4Includes models for job alert emails, extracted job IDs, and scraped job data 

5with associated companies and locations from external sources.""" 

6 

7from sqlalchemy import ( 

8 Column, 

9 String, 

10 Boolean, 

11 ForeignKey, 

12 Integer, 

13 Float, 

14 TIMESTAMP, 

15 Table, 

16 UniqueConstraint, 

17 CheckConstraint, 

18) 

19from sqlalchemy.dialects.postgresql import JSONB, ARRAY as PG_ARRAY 

20from sqlalchemy.ext.hybrid import hybrid_property 

21from sqlalchemy.orm import relationship 

22from sqlalchemy.sql import expression 

23 

24from app.base_models import CommonBase, Owned 

25from app.database import Base 

26from app.service_runner.models import ServiceLog 

27 

# Many-to-many association between JobEmail and ScrapedJob: one alert email can
# list several jobs, and the same job can appear in several emails. Rows are
# removed automatically when either side is deleted (ON DELETE CASCADE).
jobemail_scrapedjob_mapping = Table(
    "jobemail_scrapedjob_mapping",
    Base.metadata,
    Column("email_id", Integer, ForeignKey("job_email.id", ondelete="CASCADE"), primary_key=True),
    Column("job_id", Integer, ForeignKey("scraped_job.id", ondelete="CASCADE"), primary_key=True),
)

34 

35 

class JobEmail(Owned, Base):
    """Represents email messages containing job information like LinkedIn and Indeed job alerts.

    Attributes:
    -----------
    - `external_email_id` (str): Identifier for the email message, unique per owner.
    - `subject` (str): Subject of the email message.
    - `sender` (str): Sender of the email message.
    - `date_received` (datetime): Date and time when the email was received.
    - `job_found_n` (int): Number of jobs found in the email.
    - `platform` (str): Platform from which the email was received (LinkedIn, Indeed, etc.).
    - `body` (str): Body of the email message.
    - `alert_name` (str, optional): Name of the job alert associated with the email.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the email.

    Relationships:
    --------------
    - `jobs` (list of ScrapedJob): List of scraped jobs associated with the email.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the email.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_email_id` and `owner_id` to ensure uniqueness per user."""

    # No column-level unique=True here: uniqueness is per owner, enforced by the
    # composite constraint in __table_args__. A column-level UNIQUE would prevent
    # two different users from storing the same external email.
    external_email_id = Column(String, nullable=False)
    subject = Column(String, nullable=False)
    sender = Column(String, nullable=False)
    date_received = Column(TIMESTAMP(timezone=True), nullable=False)
    job_found_n = Column(Integer, nullable=False, default=0)
    platform = Column(String, nullable=False)
    body = Column(String, nullable=False)
    alert_name = Column(String, nullable=True)

    # Foreign keys
    # nullable=True to match ondelete="SET NULL": deleting the parent service log
    # sets this FK to NULL, which requires the column to accept NULL (the docstring
    # already documents this FK as optional).
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )

    # Relationships
    jobs = relationship("ScrapedJob", secondary=jobemail_scrapedjob_mapping, back_populates="emails")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="emails")

    # Constraints
    __table_args__ = (UniqueConstraint("external_email_id", "owner_id", name="unique_email_per_owner"),)

82 

83 

class ScrapedJob(Owned, Base):
    """Represents scraped job postings from external sources with additional metadata.

    Attributes:
    -----------
    - `external_job_id` (str): Identifier for the job posting, unique per owner.
    - `platform` (str): Platform from which the job was scraped (LinkedIn, Indeed, etc.).
    - `is_processed` (bool): Indicates whether the job has been processed (scraped, skipped, copied, etc.).
    - `is_scraped` (bool): Indicates whether the job has been successfully scraped.
    - `is_failed` (bool): Indicates whether the job scraping failed.
    - `is_skipped` (bool): Indicates whether the job scraping was skipped (e.g., quota exceeded).
    - `scrape_error` (list of dict): List of error entries, each with 'datetime' and 'error' keys, recorded on each failed scrape attempt.
    - `skip_reason` (str, optional): Reason why the job scraping was skipped.
    - `scrape_datetime` (datetime, optional): Date and time when the job was scraped.
    - `is_active` (bool): Indicates whether the job is active.
    - `is_imported` (bool): Indicates whether the job was imported into a job.
    - `retry_count` (int): Number of times the job has been retried.
    - `next_retry_at` (datetime, optional): Date and time when the next retry is scheduled.

    # Job data
    - `title` (str, optional): Title of the job.
    - `description` (str, optional): Description of the job.
    - `salary_min` (float, optional): Minimum salary of the job.
    - `salary_max` (float, optional): Maximum salary of the job.
    - `salary_currency` (str, optional): Salary currency.
    - `url` (str, optional): URL to the job posting.
    - `raw_url` (str, optional): Raw URL to the job posting.
    - `deadline` (datetime, optional): Deadline for the job.
    - `company` (str, optional): Company name of the job.
    - `location` (str, optional): Raw location string of the job posting.
    - `location_postcode` (str, optional): Postcode of the job location.
    - `location_city` (str, optional): City of the job location.
    - `location_country` (str, optional): Country of the job location.
    - `parsed_location` (str, optional): Parsed location of the job posting.
    - `attendance_type` (str, optional): Attendance type of the job (e.g., remote, on-site).
    - `is_closed` (bool): Indicates whether the job is closed.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the job.
    - `exclusion_filter_id` (int, optional): Identifier for the ScrapingExclusionFilter that excluded the job.
    - `favourite_filter_id` (int, optional): Identifier for the ScrapingFavouriteFilter that matched the job.
    - `geolocation_id` (int, optional): Identifier for the Geolocation associated with the job.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the job.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the job.
    - `job_rating` (JobRating): JobRating object associated with the job.
    - `exclusion_filter` (ScrapingExclusionFilter): Exclusion filter that matched the job, if any.
    - `favourite_filter` (ScrapingFavouriteFilter): Favourite filter that matched the job, if any.
    - `geolocation` (Geolocation): Geolocation record for the job, if any.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_job_id` and `owner_id` to ensure uniqueness per user."""

    external_job_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    is_processed = Column(Boolean, nullable=False, server_default=expression.false())
    is_scraped = Column(Boolean, nullable=False, server_default=expression.false())
    is_failed = Column(Boolean, nullable=False, server_default=expression.false())
    scrape_error = Column(JSONB, server_default="[]", nullable=False)
    is_skipped = Column(Boolean, nullable=False, server_default=expression.false())
    skip_reason = Column(String, nullable=True)
    scrape_datetime = Column(TIMESTAMP(timezone=True), nullable=True)
    is_active = Column(Boolean, nullable=False, server_default=expression.true())
    is_imported = Column(Boolean, nullable=False, server_default=expression.false())
    retry_count = Column(Integer, nullable=False, server_default="0")
    next_retry_at = Column(TIMESTAMP(timezone=True), nullable=True)

    # Job data
    title = Column(String, nullable=True)
    description = Column(String, nullable=True)
    salary_min = Column(Float, nullable=True)
    salary_max = Column(Float, nullable=True)
    salary_currency = Column(String, nullable=True)
    url = Column(String, nullable=True)
    raw_url = Column(String, nullable=True)
    deadline = Column(TIMESTAMP(timezone=True), nullable=True)
    company = Column(String, nullable=True)
    location = Column(String, nullable=True)
    location_postcode = Column(String, nullable=True)
    location_city = Column(String, nullable=True)
    location_country = Column(String, nullable=True)
    parsed_location = Column(String, nullable=True)
    attendance_type = Column(String, nullable=True)
    # server_default (not default=) for consistency with the other boolean flags
    # above, so the FALSE default is emitted in the DDL as well.
    is_closed = Column(Boolean, nullable=False, server_default=expression.false())

    # Foreign keys
    # nullable=True to match ondelete="SET NULL": deleting the parent service log
    # sets this FK to NULL, which requires the column to accept NULL.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )
    exclusion_filter_id = Column(
        Integer, ForeignKey("scraping_exclusion_filter.id", ondelete="SET NULL"), nullable=True
    )
    favourite_filter_id = Column(
        Integer, ForeignKey("scraping_favourite_filter.id", ondelete="SET NULL"), nullable=True
    )
    geolocation_id = Column(Integer, ForeignKey("geolocation.id", ondelete="SET NULL"), nullable=True)

    # Relationships
    emails = relationship("JobEmail", secondary=jobemail_scrapedjob_mapping, back_populates="jobs")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="scraped_jobs")
    job_rating = relationship("JobRating", back_populates="scraped_job", uselist=False)
    exclusion_filter = relationship("ScrapingExclusionFilter", back_populates="filtered_jobs")
    favourite_filter = relationship("ScrapingFavouriteFilter", back_populates="filtered_jobs")
    geolocation = relationship("Geolocation")

    # Constraints
    __table_args__ = (UniqueConstraint("external_job_id", "owner_id", name="unique_job_per_owner"),)

188 

189 

class JobEmailScrapingServiceLog(ServiceLog, CommonBase, Base):
    """Represents logs of service operations and their status.

    Attributes:
    -----------
    - `user_found_ids` (list of int): List of user IDs found during the service run.
    - `user_processed_ids` (list of int): List of user IDs processed during the service run.
    - `email_found_n` (int): Number of emails found during the service run.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the service log.
    - `scraped_jobs` (list of ScrapedJob): List of scraped jobs associated with the service log.
    - `platform_stats` (list of JobEmailScrapingPlatformStat): List of platform statistics associated with the service log.
    - `service_errors` (list of JobEmailScrapingServiceError): List of errors associated with the service log.

    Properties:
    -----------
    - `job_to_process_n` (int): Total jobs to scrape across all platforms.
    - `job_scrape_succeeded_n` (int): Total successfully scraped jobs across all platforms.
    - `job_scrape_failed_n` (int): Total failed scraped jobs across all platforms.
    - `job_scrape_copied_n` (int): Total copied scraped jobs found across all platforms.
    - `job_scrape_skipped_n` (int): Total skipped scraped jobs found across all platforms.
    - `job_scrape_filtered_n` (int): Total filtered scraped jobs across all platforms.
    - `job_found_n` (int): Total jobs found (copied + skipped) across all platforms.
    - `email_saved_n` (int): Total emails saved across all platforms.
    - `email_skipped_n` (int): Total emails skipped across all platforms."""

    # Users
    user_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    user_processed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Emails
    email_found_n = Column(Integer, nullable=False, default=0)

    # Relationships
    emails = relationship("JobEmail", back_populates="service_log")
    scraped_jobs = relationship("ScrapedJob", back_populates="service_log")
    platform_stats = relationship("JobEmailScrapingPlatformStat", back_populates="service_log")
    service_errors = relationship("JobEmailScrapingServiceError", back_populates="service_log")

    def __init__(self, **kwargs) -> None:
        """Initialise array fields with empty lists if not provided.

        PostgreSQL array server defaults only apply at INSERT time; defaulting
        here as well lets code append to the lists before the first flush."""
        kwargs.setdefault("user_found_ids", [])
        kwargs.setdefault("user_processed_ids", [])
        super().__init__(**kwargs)

    # NOTE: the hybrid properties below aggregate in Python over the loaded
    # `platform_stats` collection; they define no SQL-side `.expression`.

    @hybrid_property
    def job_to_process_n(self) -> int:
        """Total jobs to scrape across all platforms."""
        return sum(len(stat.job_to_process_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_succeeded_n(self) -> int:
        """Total successfully scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_succeeded_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_failed_n(self) -> int:
        """Total failed scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_failed_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_copied_n(self) -> int:
        """Total copied scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_copied_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_skipped_n(self) -> int:
        """Total skipped scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_found_n(self) -> int:
        """Total jobs copied/skipped across all platforms."""
        return sum(len(stat.job_found_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_saved_n(self) -> int:
        """Total emails saved across all platforms."""
        return sum(len(stat.email_saved_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_skipped_n(self) -> int:
        """Total emails skipped across all platforms."""
        return sum(len(stat.email_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_filtered_n(self) -> int:
        """Total filtered scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_filtered_ids) for stat in self.platform_stats)

280 

class JobEmailScrapingPlatformStat(CommonBase, Base):
    """Per-platform stats for a service run linked to an JobEmailScrapingServiceLog.

    Attributes:
    -----------
    - `name` (str): Platform name (e.g. LinkedIn, Indeed).

    # Emails
    - `email_saved_ids` (list of int): List of saved email IDs.
    - `email_skipped_ids` (list of int): List of skipped email IDs.

    # Jobs
    - `job_found_ids` (list of int): List of found job IDs from the emails.
    - `job_to_process_ids` (list of int): List of job IDs to be processed.
    - `job_scrape_failed_ids` (list of int): List of failed job scrape IDs.
    - `job_scrape_succeeded_ids` (list of int): List of successful job scrape IDs.
    - `job_scrape_copied_ids` (list of int): List of copied job scrape IDs.
    - `job_scrape_skipped_ids` (list of int): List of skipped job scrape IDs.
    - `job_scrape_filtered_ids` (list of int): List of filtered job scrape IDs.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog.

    Constraints:
    ------------
    - Unique constraint on the combination of `service_log_id` and `name` to ensure uniqueness per platform per service log.
    """

    name = Column(String, nullable=False)

    # Emails
    email_saved_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    email_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Jobs
    job_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_to_process_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_failed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_succeeded_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_copied_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_filtered_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Foreign keys
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="platform_stats")

    # Constraints
    __table_args__ = (UniqueConstraint("service_log_id", "name", name="unique_platform_per_service_log"),)

    def __init__(self, **kwargs) -> None:
        """Default every array column to an empty list when not supplied,
        so lists can be appended to before the row is first flushed."""
        array_columns = (
            "email_saved_ids",
            "email_skipped_ids",
            "job_found_ids",
            "job_to_process_ids",
            "job_scrape_failed_ids",
            "job_scrape_succeeded_ids",
            "job_scrape_copied_ids",
            "job_scrape_skipped_ids",
            "job_scrape_filtered_ids",
        )
        for column_name in array_columns:
            kwargs.setdefault(column_name, [])
        super().__init__(**kwargs)

352 

353 

class JobEmailScrapingServiceError(CommonBase, Base):
    """Records unexpected/unhandled errors raised during a service run.

    Attributes:
    -----------
    - `error_type` (str): Type of the error.
    - `message` (str): Error message.
    - `traceback` (str): Traceback of the error.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog"""

    # All three fields are required (NOT NULL); callers must pass a string
    # (possibly empty) for message and traceback.
    error_type = Column(String, nullable=False)
    message = Column(String, nullable=False)
    traceback = Column(String, nullable=False)

    # Foreign keys
    # CASCADE: errors are owned by their service log and deleted with it.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="service_errors")

382 

383 

class Filter:
    """Mixin representing user-defined rules to filter scraped jobs.

    Attributes:
    -----------
    - `type` (str): Field the filter applies to (title, company, location, location_city, location_country, salary_min, salary_max, attendance_type).
    - `operator` (str): Operator for the filter (contains, equals, starts_with, ends_with, less_than, greater_than, not_contains, not_equals).
    - `value` (str): Value to match against.
    - `case_sensitive` (bool): Whether string matching should be case-sensitive.
    - `is_active` (bool): Whether the filter is active.

    Constraints:
    ------------
    - Check constraint to ensure valid `type` values.
    - Check constraint to ensure valid `operator` values."""

    # NOTE: `type` intentionally shadows the builtin at class scope; it is a
    # column name and part of the persisted schema.
    type = Column(String, nullable=False)
    operator = Column(String, nullable=False)
    value = Column(String, nullable=False)
    case_sensitive = Column(Boolean, nullable=False, server_default=expression.false())
    is_active = Column(Boolean, nullable=False, server_default=expression.true())

    # Constraints (inherited by the concrete filter tables below)
    __table_args__ = (
        CheckConstraint(
            "type IN ('title', 'company', 'location', 'location_city', 'location_country', 'salary_min', 'salary_max', 'attendance_type')",
            name="valid_filter_type",
        ),
        CheckConstraint(
            "operator IN ('contains', 'equals', 'starts_with', 'ends_with', 'less_than', 'greater_than', 'not_contains', 'not_equals')",
            name="valid_filter_operator",
        ),
    )

    @hybrid_property
    def name(self) -> str:
        """Generate a human-readable name for the filter rule."""
        case_str = " (case-sensitive)" if self.case_sensitive else ""
        return f"{self.type} {self.operator} '{self.value}'{case_str}"

425 

class ScrapingExclusionFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter out scraped jobs."""

    # Jobs this rule excluded (via ScrapedJob.exclusion_filter_id).
    filtered_jobs = relationship("ScrapedJob", back_populates="exclusion_filter")

430 

431 

class ScrapingFavouriteFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter favourite scraped jobs."""

    # Jobs this rule marked as favourite (via ScrapedJob.favourite_filter_id).
    filtered_jobs = relationship("ScrapedJob", back_populates="favourite_filter")

436 

437 

class ForwardingConfirmationLink(Owned, Base):
    """Represents a confirmation email sent to a user after setting up forwarding.

    Attributes:
    -----------
    - `email_external_id` (str): External ID of the email.
    - `url` (str): URL to the confirmation page.
    - `platform` (str): Platform (e.g., Gmail).
    - `is_used` (bool): Whether the confirmation link has been used."""

    email_external_id = Column(String, nullable=False)
    url = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    # Defaults to FALSE at the database level; presumably flipped once the link
    # is visited — confirm against the consumer of this model.
    is_used = Column(Boolean, nullable=False, server_default=expression.false())