Coverage for backend / app / job_email_scraping / models.py: 99%

153 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""Job Email Scraper Database Models 

2 

3Defines SQLAlchemy ORM models for email-based job scraping functionality. 

4Includes models for job alert emails, extracted job IDs, and scraped job data 

5with associated companies and locations from external sources.""" 

6 

7from sqlalchemy import ( 

8 Column, 

9 String, 

10 Boolean, 

11 ForeignKey, 

12 Integer, 

13 Float, 

14 TIMESTAMP, 

15 Table, 

16 UniqueConstraint, 

17 CheckConstraint, 

18) 

19from sqlalchemy.dialects.postgresql import JSONB, ARRAY as PG_ARRAY 

20from sqlalchemy.ext.hybrid import hybrid_property 

21from sqlalchemy.orm import relationship 

22from sqlalchemy.sql import expression 

23 

24from app.base_models import CommonBase, Owned 

25from app.database import Base 

26from app.service_runner.models import ServiceLog 

27 

# Many-to-many association between JobEmail and ScrapedJob: one alert email can
# list several jobs, and the same job can appear in several emails. Rows are
# removed automatically when either side is deleted (ON DELETE CASCADE).
jobemail_scrapedjob_mapping = Table(
    "jobemail_scrapedjob_mapping",
    Base.metadata,
    Column("email_id", Integer, ForeignKey("job_email.id", ondelete="CASCADE"), primary_key=True),
    Column("job_id", Integer, ForeignKey("scraped_job.id", ondelete="CASCADE"), primary_key=True),
)

34 

35 

class JobEmail(Owned, Base):
    """Represents email messages containing job information like LinkedIn and Indeed job alerts.

    Attributes:
    -----------
    - `external_email_id` (str): Identifier for the email message, unique per owner.
    - `subject` (str): Subject of the email message.
    - `sender` (str): Sender of the email message.
    - `date_received` (datetime): Date and time when the email was received.
    - `job_found_n` (int): Number of jobs found in the email.
    - `platform` (str): Platform from which the email was received (LinkedIn, Indeed, etc.).
    - `body` (str): Body of the email message.
    - `alert_name` (str, optional): Name of the job alert associated with the email.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the email.

    Relationships:
    --------------
    - `jobs` (list of ScrapedJob): List of scraped jobs associated with the email.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the email.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_email_id` and `owner_id` to ensure uniqueness per user."""

    # No column-level unique=True here: uniqueness is per owner, enforced by the
    # composite constraint in __table_args__. A column-level UNIQUE would prevent
    # two different users from storing the same external email.
    external_email_id = Column(String, nullable=False)
    subject = Column(String, nullable=False)
    sender = Column(String, nullable=False)
    date_received = Column(TIMESTAMP(timezone=True), nullable=False)
    job_found_n = Column(Integer, nullable=False, default=0)
    platform = Column(String, nullable=False)
    body = Column(String, nullable=False)
    alert_name = Column(String, nullable=True)

    # Foreign keys
    # nullable=True to match ondelete="SET NULL": deleting the parent service log
    # sets this FK to NULL, which requires the column to accept NULL (the docstring
    # already documents this FK as optional).
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )

    # Relationships
    jobs = relationship("ScrapedJob", secondary=jobemail_scrapedjob_mapping, back_populates="emails")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="emails")

    # Constraints
    __table_args__ = (UniqueConstraint("external_email_id", "owner_id", name="unique_email_per_owner"),)

82 

83 

class ScrapedJob(Owned, Base):
    """Represents scraped job postings from external sources with additional metadata.

    Attributes:
    -----------
    - `external_job_id` (str): Identifier for the job posting, unique per owner.
    - `platform` (str): Platform from which the job was scraped (LinkedIn, Indeed, etc.).
    - `is_processed` (bool): Indicates whether the job has been processed (scraped, skipped, copied, etc.).
    - `is_scraped` (bool): Indicates whether the job has been successfully scraped.
    - `is_failed` (bool): Indicates whether the job scraping failed.
    - `is_skipped` (bool): Indicates whether the job scraping was skipped (e.g., quota exceeded).
    - `scrape_error` (list of dict): List of error entries, each with 'datetime' and 'error' keys, recorded on each failed scrape attempt.
    - `skip_reason` (str, optional): Reason why the job scraping was skipped.
    - `scrape_datetime` (datetime, optional): Date and time when the job was scraped.
    - `is_active` (bool): Indicates whether the job is active.
    - `is_imported` (bool): Indicates whether the job was imported into a job.
    - `retry_count` (int): Number of times the job has been retried.
    - `next_retry_at` (datetime, optional): Date and time when the next retry is scheduled.

    # Job data
    - `title` (str, optional): Title of the job.
    - `description` (str, optional): Description of the job.
    - `salary_min` (float, optional): Minimum salary of the job.
    - `salary_max` (float, optional): Maximum salary of the job.
    - `salary_currency` (str, optional): Salary currency.
    - `url` (str, optional): URL to the job posting.
    - `raw_url` (str, optional): Raw URL to the job posting.
    - `deadline` (datetime, optional): Deadline for the job.
    - `company` (str, optional): Company name of the job.
    - `location` (str, optional): Raw location string of the job posting.
    - `location_postcode` (str, optional): Postcode of the job location.
    - `location_city` (str, optional): City of the job location.
    - `location_country` (str, optional): Country of the job location.
    - `parsed_location` (str, optional): Parsed location of the job posting.
    - `attendance_type` (str, optional): Attendance type of the job (e.g., remote, on-site).
    - `is_closed` (bool): Indicates whether the job is closed.

    Foreign keys:
    -------------
    - `service_log_id` (int, optional): Identifier for the JobEmailScrapingServiceLog associated with the job.
    - `exclusion_filter_id` (int, optional): Identifier for the ScrapingExclusionFilter that excluded the job.
    - `favourite_filter_id` (int, optional): Identifier for the ScrapingFavouriteFilter that matched the job.
    - `geolocation_id` (int, optional): Identifier for the Geolocation associated with the job.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the job.
    - `service_log` (JobEmailScrapingServiceLog): JobEmailScrapingServiceLog object associated with the job.
    - `job_rating` (JobRating): JobRating object associated with the job.
    - `exclusion_filter` (ScrapingExclusionFilter): Exclusion filter that matched the job, if any.
    - `favourite_filter` (ScrapingFavouriteFilter): Favourite filter that matched the job, if any.
    - `geolocation` (Geolocation): Geolocation record for the job, if any.

    Constraints:
    ------------
    - Unique constraint on the combination of `external_job_id` and `owner_id` to ensure uniqueness per user."""

    external_job_id = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    is_processed = Column(Boolean, nullable=False, server_default=expression.false())
    is_scraped = Column(Boolean, nullable=False, server_default=expression.false())
    is_failed = Column(Boolean, nullable=False, server_default=expression.false())
    scrape_error = Column(JSONB, server_default="[]", nullable=False)
    is_skipped = Column(Boolean, nullable=False, server_default=expression.false())
    skip_reason = Column(String, nullable=True)
    scrape_datetime = Column(TIMESTAMP(timezone=True), nullable=True)
    is_active = Column(Boolean, nullable=False, server_default=expression.true())
    is_imported = Column(Boolean, nullable=False, server_default=expression.false())
    retry_count = Column(Integer, nullable=False, server_default="0")
    next_retry_at = Column(TIMESTAMP(timezone=True), nullable=True)

    # Job data
    title = Column(String, nullable=True)
    description = Column(String, nullable=True)
    salary_min = Column(Float, nullable=True)
    salary_max = Column(Float, nullable=True)
    salary_currency = Column(String, nullable=True)
    url = Column(String, nullable=True)
    raw_url = Column(String, nullable=True)
    deadline = Column(TIMESTAMP(timezone=True), nullable=True)
    company = Column(String, nullable=True)
    location = Column(String, nullable=True)
    location_postcode = Column(String, nullable=True)
    location_city = Column(String, nullable=True)
    location_country = Column(String, nullable=True)
    parsed_location = Column(String, nullable=True)
    attendance_type = Column(String, nullable=True)
    # server_default (not default=) for consistency with the other boolean flags
    # above, so the FALSE default is emitted in the DDL as well.
    is_closed = Column(Boolean, nullable=False, server_default=expression.false())

    # Foreign keys
    # nullable=True to match ondelete="SET NULL": deleting the parent service log
    # sets this FK to NULL, which requires the column to accept NULL.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="SET NULL"), nullable=True
    )
    exclusion_filter_id = Column(
        Integer, ForeignKey("scraping_exclusion_filter.id", ondelete="SET NULL"), nullable=True
    )
    favourite_filter_id = Column(
        Integer, ForeignKey("scraping_favourite_filter.id", ondelete="SET NULL"), nullable=True
    )
    geolocation_id = Column(Integer, ForeignKey("geolocation.id", ondelete="SET NULL"), nullable=True)

    # Relationships
    emails = relationship("JobEmail", secondary=jobemail_scrapedjob_mapping, back_populates="jobs")
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="scraped_jobs")
    job_rating = relationship("JobRating", back_populates="scraped_job", uselist=False)
    exclusion_filter = relationship("ScrapingExclusionFilter", back_populates="filtered_jobs")
    favourite_filter = relationship("ScrapingFavouriteFilter", back_populates="filtered_jobs")
    geolocation = relationship("Geolocation")

    # Constraints
    __table_args__ = (UniqueConstraint("external_job_id", "owner_id", name="unique_job_per_owner"),)

188 

189 

class JobEmailScrapingServiceLog(ServiceLog, CommonBase, Base):
    """Represents logs of service operations and their status.

    Attributes:
    -----------
    - `user_found_ids` (list of int): List of user IDs found during the service run.
    - `user_processed_ids` (list of int): List of user IDs processed during the service run.
    - `email_found_n` (int): Number of emails found during the service run.

    Relationships:
    --------------
    - `emails` (list of JobEmail): List of email messages associated with the service log.
    - `scraped_jobs` (list of ScrapedJob): List of scraped jobs associated with the service log.
    - `platform_stats` (list of JobEmailScrapingPlatformStat): List of platform statistics associated with the service log.
    - `service_errors` (list of JobEmailScrapingServiceError): List of errors associated with the service log.

    Properties:
    -----------
    - `job_to_process_n` (int): Total jobs to scrape across all platforms.
    - `job_scrape_succeeded_n` (int): Total successfully scraped jobs across all platforms.
    - `job_scrape_failed_n` (int): Total failed scraped jobs across all platforms.
    - `job_scrape_copied_n` (int): Total copied scraped jobs found across all platforms.
    - `job_scrape_skipped_n` (int): Total skipped scraped jobs found across all platforms.
    - `job_scrape_filtered_n` (int): Total filtered scraped jobs across all platforms.
    - `job_found_n` (int): Total jobs found (copied + skipped) across all platforms.
    - `email_saved_n` (int): Total emails saved across all platforms.
    - `email_skipped_n` (int): Total emails skipped across all platforms."""

    # Users
    user_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    user_processed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Emails
    email_found_n = Column(Integer, nullable=False, default=0)

    # Relationships
    emails = relationship("JobEmail", back_populates="service_log")
    scraped_jobs = relationship("ScrapedJob", back_populates="service_log")
    platform_stats = relationship("JobEmailScrapingPlatformStat", back_populates="service_log")
    service_errors = relationship("JobEmailScrapingServiceError", back_populates="service_log")

    def __init__(self, **kwargs) -> None:
        """Initialise array fields with empty lists if not provided.

        PostgreSQL array server defaults only apply at INSERT time; defaulting
        here as well lets code append to the lists before the first flush."""
        kwargs.setdefault("user_found_ids", [])
        kwargs.setdefault("user_processed_ids", [])
        super().__init__(**kwargs)

    # NOTE: the hybrid properties below aggregate in Python over the loaded
    # `platform_stats` collection; they define no SQL-side `.expression`.

    @hybrid_property
    def job_to_process_n(self) -> int:
        """Total jobs to scrape across all platforms."""
        return sum(len(stat.job_to_process_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_succeeded_n(self) -> int:
        """Total successfully scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_succeeded_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_failed_n(self) -> int:
        """Total failed scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_failed_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_copied_n(self) -> int:
        """Total copied scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_copied_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_skipped_n(self) -> int:
        """Total skipped scraped jobs found across all platforms."""
        return sum(len(stat.job_scrape_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_found_n(self) -> int:
        """Total jobs copied/skipped across all platforms."""
        return sum(len(stat.job_found_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_saved_n(self) -> int:
        """Total emails saved across all platforms."""
        return sum(len(stat.email_saved_ids) for stat in self.platform_stats)

    @hybrid_property
    def email_skipped_n(self) -> int:
        """Total emails skipped across all platforms."""
        return sum(len(stat.email_skipped_ids) for stat in self.platform_stats)

    @hybrid_property
    def job_scrape_filtered_n(self) -> int:
        """Total filtered scraped jobs across all platforms."""
        return sum(len(stat.job_scrape_filtered_ids) for stat in self.platform_stats)

280 

class JobEmailScrapingPlatformStat(CommonBase, Base):
    """Per-platform stats for a service run linked to an JobEmailScrapingServiceLog.

    Attributes:
    -----------
    - `name` (str): Platform name (e.g. LinkedIn, Indeed).

    # Emails
    - `email_saved_ids` (list of int): List of saved email IDs.
    - `email_skipped_ids` (list of int): List of skipped email IDs.

    # Jobs
    - `job_found_ids` (list of int): List of found job IDs from the emails.
    - `job_to_process_ids` (list of int): List of job IDs to be processed.
    - `job_scrape_failed_ids` (list of int): List of failed job scrape IDs.
    - `job_scrape_succeeded_ids` (list of int): List of successful job scrape IDs.
    - `job_scrape_copied_ids` (list of int): List of copied job scrape IDs.
    - `job_scrape_skipped_ids` (list of int): List of skipped job scrape IDs.
    - `job_scrape_filtered_ids` (list of int): List of filtered job scrape IDs.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog.

    Constraints:
    ------------
    - Unique constraint on the combination of `service_log_id` and `name` to ensure uniqueness per platform per service log.
    """

    name = Column(String, nullable=False)

    # Emails
    email_saved_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    email_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Jobs
    job_found_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_to_process_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_failed_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_succeeded_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_copied_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_skipped_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)
    job_scrape_filtered_ids = Column(PG_ARRAY(Integer), server_default="{}", nullable=False)

    # Foreign keys
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="platform_stats")

    # Constraints
    __table_args__ = (UniqueConstraint("service_log_id", "name", name="unique_platform_per_service_log"),)

    def __init__(self, **kwargs) -> None:
        """Default every array column to an empty list when not supplied,
        so lists can be appended to before the row is first flushed."""
        array_columns = (
            "email_saved_ids",
            "email_skipped_ids",
            "job_found_ids",
            "job_to_process_ids",
            "job_scrape_failed_ids",
            "job_scrape_succeeded_ids",
            "job_scrape_copied_ids",
            "job_scrape_skipped_ids",
            "job_scrape_filtered_ids",
        )
        for column_name in array_columns:
            kwargs.setdefault(column_name, [])
        super().__init__(**kwargs)

352 

353 

class JobEmailScrapingServiceError(CommonBase, Base):
    """Records unexpected/unhandled errors raised during a service run.

    Attributes:
    -----------
    - `error_type` (str): Type of the error.
    - `message` (str): Error message.
    - `traceback` (str): Traceback of the error.

    Foreign keys:
    -------------
    - `service_log_id` (int): Foreign key to the associated JobEmailScrapingServiceLog.

    Relationships:
    --------------
    - `service_log` (JobEmailScrapingServiceLog): Relationship to the associated JobEmailScrapingServiceLog"""

    # All three fields are required (NOT NULL); callers must pass a string
    # (possibly empty) for message and traceback.
    error_type = Column(String, nullable=False)
    message = Column(String, nullable=False)
    traceback = Column(String, nullable=False)

    # Foreign keys
    # CASCADE: errors are owned by their service log and deleted with it.
    service_log_id = Column(
        Integer, ForeignKey("job_email_scraping_service_log.id", ondelete="CASCADE"), nullable=False
    )

    # Relationships
    service_log = relationship("JobEmailScrapingServiceLog", back_populates="service_errors")

382 

383 

class Filter:
    """Mixin representing user-defined rules to filter scraped jobs.

    Attributes:
    -----------
    - `type` (str): Field the filter applies to (title, company, location, location_city, location_country, salary_min, salary_max, attendance_type).
    - `operator` (str): Operator for the filter (contains, equals, starts_with, ends_with, less_than, greater_than, not_contains, not_equals).
    - `value` (str): Value to match against.
    - `case_sensitive` (bool): Whether string matching should be case-sensitive.
    - `is_active` (bool): Whether the filter is active.

    Constraints:
    ------------
    - Check constraint to ensure valid `type` values.
    - Check constraint to ensure valid `operator` values."""

    # NOTE: `type` intentionally shadows the builtin at class scope; it is a
    # column name and part of the persisted schema.
    type = Column(String, nullable=False)
    operator = Column(String, nullable=False)
    value = Column(String, nullable=False)
    case_sensitive = Column(Boolean, nullable=False, server_default=expression.false())
    is_active = Column(Boolean, nullable=False, server_default=expression.true())

    # Constraints (inherited by the concrete filter tables below)
    __table_args__ = (
        CheckConstraint(
            "type IN ('title', 'company', 'location', 'location_city', 'location_country', 'salary_min', 'salary_max', 'attendance_type')",
            name="valid_filter_type",
        ),
        CheckConstraint(
            "operator IN ('contains', 'equals', 'starts_with', 'ends_with', 'less_than', 'greater_than', 'not_contains', 'not_equals')",
            name="valid_filter_operator",
        ),
    )

    @hybrid_property
    def name(self) -> str:
        """Generate a human-readable name for the filter rule."""
        case_str = " (case-sensitive)" if self.case_sensitive else ""
        return f"{self.type} {self.operator} '{self.value}'{case_str}"

425 

class ScrapingExclusionFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter out scraped jobs."""

    # Jobs this rule excluded (via ScrapedJob.exclusion_filter_id).
    filtered_jobs = relationship("ScrapedJob", back_populates="exclusion_filter")

430 

431 

class ScrapingFavouriteFilter(Filter, Owned, Base):
    """Represents user-defined rules to filter favourite scraped jobs."""

    # Jobs this rule marked as favourite (via ScrapedJob.favourite_filter_id).
    filtered_jobs = relationship("ScrapedJob", back_populates="favourite_filter")

436 

437 

class ForwardingConfirmationLink(Owned, Base):
    """Represents a confirmation email sent to a user after setting up forwarding.

    Attributes:
    -----------
    - `email_external_id` (str): External ID of the email.
    - `url` (str): URL to the confirmation page.
    - `platform` (str): Platform (e.g., Gmail).
    - `is_used` (bool): Whether the confirmation link has been used."""

    email_external_id = Column(String, nullable=False)
    url = Column(String, nullable=False)
    platform = Column(String, nullable=False)
    # Defaults to FALSE at the database level; presumably flipped once the link
    # is visited — confirm against the consumer of this model.
    is_used = Column(Boolean, nullable=False, server_default=expression.false())