Skip to content

Commit 5fdebe6

Browse files
committed
Move "crawl installation" to a single Lambda call
Adding public repos brings in more business logic, which is always a pain with Step Functions. Also, since GitHub's "best practices" recommend against parallel API calls, we probably don't want to parallelize the various tasks anyway. There's some benefit to being able to do retries at a lower level, but errors should be rare anyway. I may roll this back in the future, but for now I think this is easier.
1 parent c6f8ad2 commit 5fdebe6

File tree

10 files changed

+62
-187
lines changed

10 files changed

+62
-187
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import { AppState } from '../../../environment/AppState'
2+
import { GithubInstallation } from '../../types/GithubInstallation'
3+
import { crawlUsers } from './crawlUsers'
4+
import { crawlRepositories } from './crawlRepositories'
5+
import { crawlPushes } from './crawlPushes'
6+
import { crawlWorkflowRunEvents } from './crawlRunEvents'
7+
import { logger } from '../../../util/logging'
8+
9+
export async function crawlInstallation(
10+
appState: AppState,
11+
installation: GithubInstallation,
12+
lookbackDays: number
13+
) {
14+
logger.info(`Crawling Installation for ${installation.accountLogin}`)
15+
await crawlUsers(appState, installation)
16+
const repos = await crawlRepositories(appState, installation)
17+
// Eventually consider doing some parallelization here (or move back to step function) but
18+
// need to be careful since GitHub gets twitchy about concurrent requests to the API
19+
// Their "best practice" doc says don't do it, but their rate limit doc says it's supported
20+
// Only really need to care if things start getting slow
21+
for (const repo of repos) {
22+
await crawlPushes(appState, installation, repo)
23+
await crawlWorkflowRunEvents(appState, installation, repo, lookbackDays)
24+
}
25+
}

src/app/domain/github/crawler/crawlInstallations.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ import { processRawInstallation } from '../githubInstallation'
22
import { AppState } from '../../../environment/AppState'
33
import { removeNullAndUndefined } from '../../../util/collections'
44
import { GithubInstallation } from '../../types/GithubInstallation'
5+
import { logger } from '../../../util/logging'
56

67
export async function crawlInstallations(appState: AppState): Promise<GithubInstallation[]> {
8+
logger.info(`Crawling Installations`)
79
const installations = await appState.githubClient.listInstallations()
810

911
return removeNullAndUndefined(

src/app/domain/github/crawler/crawlPushes.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { isRawGithubPushEventEvent } from '../../types/rawGithub/RawGithubAPIPus
44
import { fromRawGithubPushEventEvent, GithubPush } from '../../types/GithubPush'
55
import { processPushes } from '../githubPush'
66
import { GithubInstallation } from '../../types/GithubInstallation'
7+
import { logger } from '../../../util/logging'
78

89
// TOEventually - only get all pushes back to lookback in crawl configuration, however GitHub doesn't keep
910
// them around for very long
@@ -13,6 +14,7 @@ export async function crawlPushes(
1314
installation: GithubInstallation,
1415
repo: GithubRepositorySummary
1516
) {
17+
logger.info(`Crawling Pushes for ${installation.accountLogin}/${repo.name}`)
1618
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
1719
const allEventsForRepo = await githubClient.listMostRecentEventsForRepo(repo.ownerName, repo.name)
1820
const rawPushes = allEventsForRepo.filter(isRawGithubPushEventEvent)

src/app/domain/github/crawler/crawlRepositories.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ import { GithubInstallation } from '../../types/GithubInstallation'
33
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
44
import { processRawRepositories, toRepositorySummary } from '../githubRepository'
55
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/GithubAccountType'
6+
import { logger } from '../../../util/logging'
67

78
export async function crawlRepositories(appState: AppState, installation: GithubInstallation) {
9+
logger.info(`Crawling Repositories for ${installation.accountLogin}`)
810
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
911
const latestRawRepositories = await readRawRepositories(installation, githubClient)
1012
const repos = await processRawRepositories(appState, latestRawRepositories)

src/app/domain/github/crawler/crawlRunEvents.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { GithubRepositorySummary } from '../../types/GithubRepository'
33
import { dateTimeAddDays } from '../../../util/dateAndTime'
44
import { processRawRunEvents } from '../githubWorkflowRunEvent'
55
import { GithubInstallation } from '../../types/GithubInstallation'
6+
import { logger } from '../../../util/logging'
67

78
export async function crawlWorkflowRunEvents(
89
appState: AppState,
@@ -11,6 +12,7 @@ export async function crawlWorkflowRunEvents(
1112
repo: GithubRepositorySummary,
1213
lookbackDays: number
1314
) {
15+
logger.info(`Crawling Run Events for ${installation.accountLogin}/${repo.name}`)
1416
const githubClient = appState.githubClient.clientForInstallation(installation.installationId)
1517
const startTime = `${dateTimeAddDays(appState.clock.now(), -1 * lookbackDays).toISOString()}`
1618

src/app/domain/github/crawler/crawlUsers.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ import { GithubInstallation } from '../../types/GithubInstallation'
33
import { GithubInstallationClient } from '../../../outboundInterfaces/githubInstallationClient'
44
import { processRawUsers } from '../githubUser'
55
import { ORGANIZATION_ACCOUNT_TYPE, USER_ACCOUNT_TYPE } from '../../types/GithubAccountType'
6+
import { logger } from '../../../util/logging'
67

78
export async function crawlUsers(appState: AppState, installation: GithubInstallation) {
9+
logger.info(`Crawling Users for ${installation.accountLogin}`)
810
const latestRawUsers = await readRawUsers(
911
installation,
1012
appState.githubClient.clientForInstallation(installation.installationId)
Lines changed: 15 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,37 @@
11
import { GithubInstallation, isGithubInstallation } from '../../domain/types/GithubInstallation'
2-
import { GithubRepositorySummary, isGithubRepositorySummary } from '../../domain/types/GithubRepository'
32
import { throwError } from '@symphoniacloud/dynamodb-entity-store'
43
import {
54
CRAWLABLE_RESOURCES,
65
CrawlableResource,
76
isCrawlableResource
8-
} from '../../../multipleContexts/githubCrawler' // TOEventually - safer type checking here
9-
10-
// TOEventually - safer type checking here
7+
} from '../../../multipleContexts/githubCrawler'
8+
import { isNotNullObject } from '../../util/types' // TOEventually - safer type checking here
119

1210
export type CrawlEvent = { resourceType: CrawlableResource }
13-
type CrawlEventWithInstallation = CrawlEvent & { installation: GithubInstallation }
14-
type CrawlEventWithRepositorySummary = CrawlEvent & { repository: GithubRepositorySummary }
1511

1612
export function isCrawlEvent(x: unknown): x is CrawlEvent {
17-
return x !== undefined && isCrawlableResource((x as CrawlEvent).resourceType)
18-
}
19-
20-
export function isCrawlEventWithInstallation(x: CrawlEvent): x is CrawlEventWithInstallation {
21-
const candidate = x as CrawlEventWithInstallation
22-
return candidate.installation && isGithubInstallation(candidate.installation)
23-
}
24-
25-
export function isCrawlEventWithRepositorySummary(x: CrawlEvent): x is CrawlEventWithRepositorySummary {
26-
const candidate = x as CrawlEventWithRepositorySummary
27-
return candidate.repository && isGithubRepositorySummary(candidate.repository)
13+
return isNotNullObject(x) && 'resourceType' in x && isCrawlableResource(x.resourceType)
2814
}
2915

3016
export type CrawlInstallationsEvent = { resourceType: 'installations' }
31-
export type CrawlUsersEvent = { resourceType: 'users' } & CrawlEventWithInstallation
32-
export type CrawlRepositoriesEvent = { resourceType: 'repositories' } & CrawlEventWithInstallation
33-
export type CrawlPushesEvent = { resourceType: 'pushes' } & CrawlEventWithInstallation &
34-
CrawlEventWithRepositorySummary
35-
export type CrawlWorkflowRunEventsEvent = {
36-
resourceType: 'pushes'
17+
18+
export type CrawlInstallationEvent = {
19+
resourceType: 'installation'
20+
installation: GithubInstallation
3721
lookbackDays: number
38-
} & CrawlEventWithInstallation &
39-
CrawlEventWithRepositorySummary
22+
}
4023

4124
export function isCrawlInstallationsEvent(x: CrawlEvent): x is CrawlInstallationsEvent {
4225
return x.resourceType === CRAWLABLE_RESOURCES.INSTALLATIONS
4326
}
4427

45-
export function isCrawlUsersEvent(x: CrawlEvent): x is CrawlUsersEvent {
46-
if (x.resourceType !== CRAWLABLE_RESOURCES.USERS) return false
47-
return (
48-
isCrawlEventWithInstallation(x) ||
49-
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.USERS} : ${JSON.stringify(x)}`)()
50-
)
51-
}
52-
53-
export function isCrawlRepositoriesEvent(x: CrawlEvent): x is CrawlRepositoriesEvent {
54-
if (x.resourceType !== CRAWLABLE_RESOURCES.REPOSITORIES) return false
55-
return (
56-
isCrawlEventWithInstallation(x) ||
57-
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.REPOSITORIES} : ${JSON.stringify(x)}`)()
58-
)
59-
}
60-
61-
export function isCrawlPushesEvent(x: CrawlEvent): x is CrawlPushesEvent {
62-
if (x.resourceType !== CRAWLABLE_RESOURCES.PUSHES) return false
63-
return (
64-
(isCrawlEventWithInstallation(x) && isCrawlEventWithRepositorySummary(x)) ||
65-
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.PUSHES} : ${JSON.stringify(x)}`)()
66-
)
67-
}
68-
69-
export function isCrawlWorkflowRunEventsEvent(x: CrawlEvent): x is CrawlWorkflowRunEventsEvent {
70-
if (x.resourceType !== CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS) return false
71-
const hasLookBackDays = typeof (x as CrawlWorkflowRunEventsEvent).lookbackDays !== undefined
28+
export function isCrawlInstallationEvent(x: CrawlEvent): x is CrawlInstallationEvent {
29+
if (x.resourceType !== CRAWLABLE_RESOURCES.INSTALLATION) return false
7230
return (
73-
(hasLookBackDays && isCrawlEventWithInstallation(x) && isCrawlEventWithRepositorySummary(x)) ||
74-
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS} : ${JSON.stringify(x)}`)()
31+
('installation' in x &&
32+
isGithubInstallation(x.installation) &&
33+
'lookbackDays' in x &&
34+
typeof x.lookbackDays === 'number') ||
35+
throwError(`Invalid object for ${CRAWLABLE_RESOURCES.INSTALLATION} : ${JSON.stringify(x)}`)()
7536
)
7637
}

src/app/lambdaFunctions/githubCrawlTask/lambda.ts

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,9 @@ import middy from '@middy/core'
55
import { powertoolsMiddlewares } from '../../middleware/standardMiddleware'
66
import { logger } from '../../util/logging'
77
import { isFailure } from '../../util/structuredResult'
8-
import { crawlPushes } from '../../domain/github/crawler/crawlPushes'
9-
import { crawlRepositories } from '../../domain/github/crawler/crawlRepositories'
108
import { crawlInstallations } from '../../domain/github/crawler/crawlInstallations'
11-
import { crawlUsers } from '../../domain/github/crawler/crawlUsers'
12-
import {
13-
isCrawlEvent,
14-
isCrawlInstallationsEvent,
15-
isCrawlPushesEvent,
16-
isCrawlRepositoriesEvent,
17-
isCrawlUsersEvent,
18-
isCrawlWorkflowRunEventsEvent
19-
} from './githubCrawlTaskEvents'
20-
import { crawlWorkflowRunEvents } from '../../domain/github/crawler/crawlRunEvents'
9+
import { isCrawlEvent, isCrawlInstallationEvent, isCrawlInstallationsEvent } from './githubCrawlTaskEvents'
10+
import { crawlInstallation } from '../../domain/github/crawler/crawlInstallation'
2111

2212
let appState: AppState
2313

@@ -40,20 +30,8 @@ export const baseHandler: Handler<unknown, unknown> = async (event) => {
4030
return await crawlInstallations(appState)
4131
}
4232

43-
if (isCrawlUsersEvent(event)) {
44-
return await crawlUsers(appState, event.installation)
45-
}
46-
47-
if (isCrawlRepositoriesEvent(event)) {
48-
return await crawlRepositories(appState, event.installation)
49-
}
50-
51-
if (isCrawlPushesEvent(event)) {
52-
return await crawlPushes(appState, event.installation, event.repository)
53-
}
54-
55-
if (isCrawlWorkflowRunEventsEvent(event)) {
56-
return await crawlWorkflowRunEvents(appState, event.installation, event.repository, event.lookbackDays)
33+
if (isCrawlInstallationEvent(event)) {
34+
return await crawlInstallation(appState, event.installation, event.lookbackDays)
5735
}
5836

5937
throw new Error(`unknown event format: ${event}`)

src/cdk/stacks/main/githubCrawlers.ts

Lines changed: 7 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,20 @@ import {
55
DefinitionBody,
66
IntegrationPattern,
77
JsonPath,
8-
LogLevel,
98
Map,
109
StateMachine,
11-
StateMachineType,
1210
TaskInput
1311
} from 'aws-cdk-lib/aws-stepfunctions'
1412
import { LambdaInvoke, StepFunctionsStartExecution } from 'aws-cdk-lib/aws-stepfunctions-tasks'
1513
import { CRAWLABLE_RESOURCES } from '../../../multipleContexts/githubCrawler'
1614
import { Rule, Schedule } from 'aws-cdk-lib/aws-events'
1715
import { SfnStateMachine } from 'aws-cdk-lib/aws-events-targets'
1816
import { EVENTBRIDGE_DETAIL_TYPES } from '../../../multipleContexts/eventBridge'
19-
import { Duration, RemovalPolicy } from 'aws-cdk-lib'
20-
import { LogGroup } from 'aws-cdk-lib/aws-logs'
17+
import { Duration } from 'aws-cdk-lib'
2118

2219
export function defineGithubCrawlers(scope: Construct, props: MainStackProps) {
2320
const crawlerFunction = defineGithubCrawlerFunction(scope, props)
24-
const reposChildrenCrawler = defineReposChildrenCrawler(scope, props, crawlerFunction)
25-
const installationCrawler = defineInstallationCrawler(scope, props, crawlerFunction, reposChildrenCrawler)
21+
const installationCrawler = defineInstallationCrawler(scope, props, crawlerFunction)
2622
const allInstallationsCrawler = defineAllInstallationsCrawler(
2723
scope,
2824
props,
@@ -52,114 +48,22 @@ function defineGithubCrawlerFunction(scope: Construct, props: MainStackProps) {
5248
)
5349
}
5450

55-
function defineReposChildrenCrawler(
56-
scope: Construct,
57-
props: MainStackProps,
58-
crawlerFunction: CicadaFunction
59-
) {
60-
const crawlPushes = new LambdaInvoke(scope, 'crawlPushes', {
61-
lambdaFunction: crawlerFunction,
62-
payload: TaskInput.fromObject({
63-
resourceType: CRAWLABLE_RESOURCES.PUSHES,
64-
installation: JsonPath.objectAt('$.installation'),
65-
repository: JsonPath.objectAt('$.repository')
66-
}),
67-
// Pass through original input to next state
68-
resultPath: JsonPath.DISCARD
69-
})
70-
71-
const crawlWorkflowRunEvents = new LambdaInvoke(scope, 'crawlWorkflowRunEvents', {
51+
function defineInstallationCrawler(scope: Construct, props: MainStackProps, crawlerFunction: CicadaFunction) {
52+
const crawlInstallation = new LambdaInvoke(scope, 'crawlInstallation', {
7253
lambdaFunction: crawlerFunction,
7354
payload: TaskInput.fromObject({
74-
resourceType: CRAWLABLE_RESOURCES.WORKFLOW_RUN_EVENTS,
75-
installation: JsonPath.objectAt('$.installation'),
76-
repository: JsonPath.objectAt('$.repository'),
77-
lookbackDays: JsonPath.numberAt('$$.Execution.Input.lookbackDays')
78-
}),
79-
resultPath: JsonPath.DISCARD
80-
})
81-
82-
// TOEventually - need to consider github app rate limiting (max 5000 requests / hour, etc.)
83-
// TOEventually - as part of rate limiting use conditional requests, look at returned quota data, etc.
84-
const forEachRepository = new Map(scope, 'forEachRepository', {
85-
maxConcurrency: 10,
86-
itemsPath: '$.repositories',
87-
itemSelector: {
55+
resourceType: CRAWLABLE_RESOURCES.INSTALLATION,
8856
installation: JsonPath.objectAt('$.installation'),
89-
repository: JsonPath.objectAt('$$.Map.Item.Value')
90-
}
91-
})
92-
forEachRepository.itemProcessor(crawlPushes.next(crawlWorkflowRunEvents))
93-
94-
return new StateMachine(scope, 'repoElementsCrawler', {
95-
stateMachineName: `${props.appName}-repositories-elements-crawler`,
96-
stateMachineType: StateMachineType.EXPRESS,
97-
// Need to configure logs because Express Workflows don't have any diagnostics otherwise
98-
logs: {
99-
level: LogLevel.ALL,
100-
destination: new LogGroup(scope, 'repoElementsCrawlerLogGroup', {
101-
logGroupName: `${props.appName}-repositories-elements-crawler`,
102-
removalPolicy: RemovalPolicy.DESTROY,
103-
retention: props.logRetention
104-
})
105-
},
106-
comment: 'Crawl child objects of a list of repositories (Express)',
107-
definitionBody: DefinitionBody.fromChainable(forEachRepository),
108-
tracingEnabled: true
109-
})
110-
}
111-
112-
function defineInstallationCrawler(
113-
scope: Construct,
114-
props: MainStackProps,
115-
crawlerFunction: CicadaFunction,
116-
reposChildrenCrawler: StateMachine
117-
) {
118-
const crawlUsers = new LambdaInvoke(scope, 'crawlUsers', {
119-
lambdaFunction: crawlerFunction,
120-
payload: TaskInput.fromObject({
121-
resourceType: CRAWLABLE_RESOURCES.USERS,
122-
installation: JsonPath.objectAt('$.installation')
57+
lookbackDays: JsonPath.numberAt('$.lookbackDays')
12358
}),
12459
// Pass through original input to next state
12560
resultPath: JsonPath.DISCARD
12661
})
12762

128-
const crawlRepositories = new LambdaInvoke(scope, 'crawlRepositories', {
129-
lambdaFunction: crawlerFunction,
130-
payload: TaskInput.fromObject({
131-
resourceType: CRAWLABLE_RESOURCES.REPOSITORIES,
132-
installation: JsonPath.objectAt('$.installation')
133-
}),
134-
resultSelector: {
135-
repositories: JsonPath.objectAt('$.Payload')
136-
},
137-
resultPath: '$.repositoriesCrawler'
138-
})
139-
140-
// TOEventually - will need to partition set of repositories due to either max timeout of
141-
// underlying express workflow OR because of size of request.
142-
const invokeReposChildrenCrawler = new StepFunctionsStartExecution(
143-
scope,
144-
'installationInvokeReposChildrenCrawler',
145-
{
146-
stateMachine: reposChildrenCrawler,
147-
integrationPattern: IntegrationPattern.RUN_JOB,
148-
associateWithParent: true,
149-
input: TaskInput.fromObject({
150-
installation: JsonPath.objectAt('$.installation'),
151-
repositories: JsonPath.objectAt('$.repositoriesCrawler.repositories'),
152-
lookbackDays: JsonPath.numberAt('$.lookbackDays')
153-
})
154-
}
155-
)
156-
157-
const workflow = crawlUsers.next(crawlRepositories).next(invokeReposChildrenCrawler)
158-
15963
return new StateMachine(scope, 'installationCrawler', {
16064
stateMachineName: `${props.appName}-installation`,
16165
comment: 'Crawl a GitHub App Installation and child resources',
162-
definitionBody: DefinitionBody.fromChainable(workflow),
66+
definitionBody: DefinitionBody.fromChainable(crawlInstallation),
16367
tracingEnabled: true
16468
})
16569
}

src/multipleContexts/githubCrawler.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
export const CRAWLABLE_RESOURCES = {
22
INSTALLATIONS: 'installations',
3-
USERS: 'users',
4-
REPOSITORIES: 'repositories',
5-
PUSHES: 'pushes',
6-
WORKFLOW_RUN_EVENTS: 'workflowRunEvents'
3+
INSTALLATION: 'installation'
74
} as const
85

96
export type CrawlableResource = (typeof CRAWLABLE_RESOURCES)[keyof typeof CRAWLABLE_RESOURCES]

0 commit comments

Comments
 (0)