Skip to content

Commit 116cba6

Browse files
committed
chore: add duplicate organization pruning script
1 parent 53c6548 commit 116cba6

8 files changed

Lines changed: 151 additions & 119 deletions

File tree

services/apps/script_executor_worker/src/activities.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@ import {
22
blockMemberOrganizationAffiliation,
33
getOrganizationMembers,
44
} from './activities/block-organization-affiliation'
5-
import {
6-
findDuplicateMembersAfterDate,
7-
moveMemberActivityRelations,
8-
} from './activities/cleanup/duplicate-members'
95
import { deleteMember, getMembersToCleanup, syncRemoveMember } from './activities/cleanup/member'
106
import {
117
deleteOrganization,
@@ -44,6 +40,10 @@ import {
4440
findMembersWithSameVerifiedEmailsInDifferentPlatforms,
4541
} from './activities/merge-members-with-similar-identities'
4642
import { getUnprocessedLLMApprovedSuggestions } from './activities/process-llm-verified-merges'
43+
import {
44+
getOrganizationsToPrune,
45+
pruneOrganization,
46+
} from './activities/prune-duplicate-organizations'
4747
import { deleteIndexedEntities } from './activities/sync/entity-index'
4848
import { getMembersForSync, syncMembersBatch } from './activities/sync/member'
4949
import { getOrganizationsForSync, syncOrganizationsBatch } from './activities/sync/organization'
@@ -78,12 +78,12 @@ export {
7878
deleteIndexedEntities,
7979
getUnprocessedLLMApprovedSuggestions,
8080
getWorkflowsCount,
81-
findDuplicateMembersAfterDate,
82-
moveMemberActivityRelations,
8381
getBotMembersWithOrgAffiliation,
8482
removeBotMemberOrganization,
8583
unlinkOrganizationFromBotActivities,
8684
blockMemberOrganizationAffiliation,
8785
getOrganizationMembers,
8886
calculateMemberAffiliations,
87+
getOrganizationsToPrune,
88+
pruneOrganization,
8989
}

services/apps/script_executor_worker/src/activities/cleanup/duplicate-members.ts

Lines changed: 0 additions & 41 deletions
This file was deleted.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import OrganizationRepository from '@crowd/data-access-layer/src/old/apps/script_executor_worker/organization.repo'
2+
3+
import { svc } from '../main'
4+
5+
export async function pruneOrganization(orgId: string): Promise<void> {
6+
try {
7+
const orgRepo = new OrganizationRepository(svc.postgres.writer.connection(), svc.log)
8+
await orgRepo.pruneOrganization(orgId)
9+
} catch (error) {
10+
svc.log.error(error, 'Error pruning organization in database!')
11+
throw error
12+
}
13+
}
14+
15+
export async function getOrganizationsToPrune(
16+
batchSize: number,
17+
): Promise<{ id: string; displayName: string }[]> {
18+
try {
19+
const orgRepo = new OrganizationRepository(svc.postgres.reader.connection(), svc.log)
20+
return orgRepo.getOrganizationsToPrune(batchSize)
21+
} catch (error) {
22+
svc.log.error(error, 'Error getting organizations to prune!')
23+
throw error
24+
}
25+
}

services/apps/script_executor_worker/src/types.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,6 @@ export interface IProcessLLMVerifiedMergesArgs extends IScriptBatchTestArgs {
4646
type: string
4747
}
4848

49-
export interface ICleanupDuplicateMembersArgs extends IScriptBatchTestArgs {
50-
cutoffDate?: string
51-
checkByActivityIdentity?: boolean
52-
checkByTwitterIdentity?: boolean
53-
}
54-
5549
export interface IDedupActivityRelationsArgs extends IScriptBatchTestArgs {
5650
groupsPerRun?: number
5751
cursor?: Omit<IActivityRelationDuplicateGroup, 'activityIds'>

services/apps/script_executor_worker/src/workflows.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import { blockOrganizationAffiliation } from './workflows/block-organization-affiliation'
2-
import { cleanupDuplicateMembers } from './workflows/cleanup/duplicate-members'
32
import { cleanupMembers } from './workflows/cleanup/members'
43
import { cleanupOrganizations } from './workflows/cleanup/organizations'
54
import { dissectMember } from './workflows/dissectMember'
@@ -8,6 +7,7 @@ import { findAndMergeMembersWithSameVerifiedEmailsInDifferentPlatforms } from '.
87
import { fixBotMembersAffiliation } from './workflows/fix-bot-members-affiliation'
98
import { fixOrgIdentitiesWithWrongUrls } from './workflows/fixOrgIdentitiesWithWrongUrls'
109
import { processLLMVerifiedMerges } from './workflows/processLLMVerifiedMerges'
10+
import { pruneDuplicateOrganizations } from './workflows/prune-duplicate-organizations'
1111
import { syncMembers } from './workflows/sync/members'
1212
import { syncOrganizations } from './workflows/sync/organizations'
1313

@@ -21,7 +21,7 @@ export {
2121
cleanupMembers,
2222
cleanupOrganizations,
2323
processLLMVerifiedMerges,
24-
cleanupDuplicateMembers,
24+
pruneDuplicateOrganizations,
2525
fixBotMembersAffiliation,
2626
blockOrganizationAffiliation,
2727
}

services/apps/script_executor_worker/src/workflows/cleanup/duplicate-members.ts

Lines changed: 0 additions & 64 deletions
This file was deleted.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { proxyActivities } from '@temporalio/workflow'
2+
3+
import * as activities from '../activities'
4+
import { IScriptBatchTestArgs } from '../types'
5+
import { chunkArray } from '../utils/common'
6+
7+
const { getOrganizationsToPrune, pruneOrganization, syncRemoveOrganization } = proxyActivities<
8+
typeof activities
9+
>({
10+
startToCloseTimeout: '30 minutes',
11+
retry: { maximumAttempts: 3, backoffCoefficient: 3 },
12+
})
13+
14+
export async function pruneDuplicateOrganizations(args: IScriptBatchTestArgs): Promise<void> {
15+
const BATCH_SIZE = args.batchSize ?? 100
16+
17+
const organizationsToPrune = await getOrganizationsToPrune(BATCH_SIZE)
18+
19+
if (organizationsToPrune.length === 0) {
20+
console.log('No more organizations to prune!')
21+
return
22+
}
23+
24+
const CHUNK_SIZE = 25
25+
26+
for (const chunk of chunkArray(organizationsToPrune, CHUNK_SIZE)) {
27+
const cleanupTasks = chunk.map(async (o) => {
28+
console.log('Pruning organization', o.displayName)
29+
await syncRemoveOrganization(o.id)
30+
return pruneOrganization(o.id)
31+
})
32+
33+
await Promise.all(cleanupTasks).catch((err) => {
34+
console.error('Error pruning organizations!', err)
35+
})
36+
}
37+
}

services/libs/data-access-layer/src/old/apps/script_executor_worker/organization.repo.ts

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,87 @@ class OrganizationRepository {
179179
{ organizationId, limit, offset },
180180
)
181181
}
182+
183+
public async pruneOrganization(organizationId: string): Promise<void> {
184+
const tablesToDelete = [
185+
{ name: 'organizationNoMerge', conditions: ['organizationId', 'noMergeId'] },
186+
{ name: 'organizationToMerge', conditions: ['organizationId', 'toMergeId'] },
187+
{ name: 'organizationToMergeRaw', conditions: ['organizationId', 'toMergeId'] },
188+
{ name: 'organizationEnrichmentCache', conditions: ['organizationId'] },
189+
{ name: 'organizationEnrichments', conditions: ['organizationId'] },
190+
{ name: 'memberSegmentAffiliations', conditions: ['organizationId'] },
191+
{ name: 'organizationSegmentsAgg', conditions: ['organizationId'] },
192+
{ name: 'organizationSegments', conditions: ['organizationId'] },
193+
{ name: 'orgAttributes', conditions: ['organizationId'] },
194+
{ name: 'organizationIdentities', conditions: ['organizationId'] },
195+
{ name: 'memberOrganizations', conditions: ['organizationId'] },
196+
{ name: 'organizations', conditions: ['id'] },
197+
]
198+
199+
await this.connection.tx(async (tx) => {
200+
for (const table of tablesToDelete) {
201+
const whereClause = table.conditions
202+
.map((field) => `"${field}" = $(organizationId)`)
203+
.join(' OR ')
204+
await tx.none(`DELETE FROM "${table.name}" WHERE ${whereClause}`, { organizationId })
205+
}
206+
})
207+
}
208+
209+
public async getOrganizationsToPrune(
210+
batchSize: number,
211+
): Promise<{ id: string; displayName: string }[]> {
212+
return this.connection.query(
213+
`
214+
WITH email_providers AS (
215+
SELECT unnest(ARRAY[
216+
'gmail.com', 'gmail.co.uk', 'gmail.com.au', 'gmail.com.tr',
217+
'yahoo.com', 'yahoo.co.uk', 'yahoo.com.br', 'yahoo.co.in', 'yahoo.fr', 'yahoo.es', 'yahoo.it', 'yahoo.de', 'yahoo.ca', 'yahoo.com.au', 'yahoo.in', 'yahoo.co.jp', 'yahoo.com.ar', 'yahoo.com.mx', 'yahoo.co.id', 'yahoo.com.sg', 'yahoo.co.za', 'yahoo.com.ph', 'yahoo.com.tw', 'yahoo.com.hk', 'yahoo.com.vn',
218+
'hotmail.com', 'hotmail.co.uk', 'hotmail.fr', 'hotmail.ca', 'hotmail.it', 'hotmail.es', 'hotmail.de', 'hotmail.com.au', 'hotmail.com.mx',
219+
'icloud.com', 'icloud.com.cn',
220+
'fastmail.com', 'tutanota.com', 'tuta.io',
221+
'gmx.com', 'gmx.de', 'gmx.net', 'gmx.at', 'gmx.ch', 'gmx.fr', 'gmx.co.uk',
222+
'aol.com', 'aol.co.uk', 'aol.fr', 'aol.de',
223+
'msn.com', 'wanadoo.fr', 'orange.fr', 'comcast.net',
224+
'live.com', 'live.co.uk', 'live.fr', 'live.nl', 'live.it', 'live.com.au', 'live.ca', 'live.cn',
225+
'rediffmail.com', 'sify.com', 'indiatimes.com', 'free.fr', 'web.de',
226+
'yandex.ru', 'yandex.com', 'yandex.com.tr', 'ya.ru',
227+
'ymail.com', 'libero.it',
228+
'outlook.com', 'outlook.fr', 'outlook.co.uk', 'outlook.de', 'outlook.es', 'outlook.it', 'outlook.com.au', 'outlook.com.br', 'outlook.com.mx', 'outlook.co.jp', 'outlook.in', 'outlook.com.sg', 'outlook.co.za', 'outlook.co.in',
229+
'uol.com.br', 'bol.com.br',
230+
'mail.ru', 'inbox.ru', 'list.ru', 'bk.ru',
231+
'mail.com', 'mail.de', 'mail.co.uk',
232+
'cox.net', 'sbcglobal.net', 'sfr.fr', 'verizon.net', 'googlemail.com', 'ig.com.br', 'bigpond.com', 'bigpond.net.au', 'terra.com.br', 'neuf.fr', 'alice.it', 'rocketmail.com', 'att.net', 'laposte.net', 'bellsouth.net', 'charter.net', 'rambler.ru', 'tiscali.it', 'tiscali.co.uk', 'shaw.ca', 'sky.com', 'earthlink.net', 'optonline.net', 'freenet.de', 't-online.de', 'aliceadsl.fr', 'virgilio.it', 'home.nl', 'qq.com', 'vip.qq.com', 'telenet.be', 'pandora.be', 'me.com', 'voila.fr', 'planet.nl', 'tin.it', 'ntlworld.com', 'arcor.de', 'frontiernet.net', 'hetnet.nl', 'zonnet.nl', 'club-internet.fr', 'juno.com', 'optusnet.com.au', 'blueyonder.co.uk', 'bluewin.ch', 'skynet.be', 'sympatico.ca', 'windstream.net', 'mac.com', 'centurytel.net', 'chello.nl', 'aim.com',
233+
'protonmail.com', 'protonmail.ch', 'proton.me', 'pm.me', 'duck.com',
234+
'zoho.com', 'zohomail.com',
235+
'users.noreply.github.com',
236+
'126.com', '139.com', '163.com', '188.com', 'foxmail.com', 'tom.com', '21cn.com', 'yeah.net',
237+
'naver.com', 'daum.net', 'hanmail.net',
238+
'hey.com', 'inbox.com', 'lycos.com', 'excite.com', 'hushmail.com', 'mailfence.com', 'mailbox.org', 'posteo.de', 'startmail.com', 'runbox.com', 'countermail.com', 'mynet.com',
239+
'wp.pl', 'onet.pl', 'interia.pl', 'o2.pl',
240+
'seznam.cz', 'centrum.cz',
241+
'mailinator.com', 'guerrillamail.com', '10minutemail.com', 'tempmail.com'
242+
]) AS provider
243+
)
244+
SELECT DISTINCT o.id, o."displayName"
245+
FROM organizations o
246+
INNER JOIN "organizationIdentities" oi
247+
ON o.id = oi."organizationId"
248+
INNER JOIN email_providers ep
249+
ON LOWER(oi.value) = ep.provider
250+
WHERE o."deletedAt" IS NULL
251+
AND oi.type = 'primary-domain'
252+
AND NOT EXISTS (
253+
SELECT 1
254+
FROM "memberOrganizations" mo
255+
WHERE mo."organizationId" = o.id
256+
AND (mo.title IS NOT NULL AND mo.title != '')
257+
AND (mo.source IS NOT NULL AND mo.source NOT IN ('email-domain'))
258+
)
259+
`,
260+
{ batchSize },
261+
)
262+
}
182263
}
183264

184265
export default OrganizationRepository

0 commit comments

Comments
 (0)