Skip to content

Commit b76602d

Browse files
author
Oleksandr Pavlov
committed
up
1 parent 6590533 commit b76602d

File tree

5 files changed

+40
-33
lines changed

5 files changed

+40
-33
lines changed

Examples/WebReaper.ConsoleApplication/Program.cs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,17 @@
11
using WebReaper.Builders;
22

33
var engine = await new ScraperEngineBuilder()
4-
.GetWithBrowser(
5-
new[] { "https://www.reddit.com/r/dotnet/" },
6-
actions => actions
7-
.ScrollToEnd()
8-
.RepeatWithDelay(10, 2000)
9-
.Build())
10-
.Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
4+
.GetWithBrowser("https://www.alexpavlov.dev/blog")
5+
.FollowWithBrowser(".text-gray-900.transition")
116
.Parse(new()
127
{
13-
new("title", "._eYtD2XCVieq6emjKBH3m"),
14-
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
8+
new("title", ".text-3xl.font-bold"),
9+
new("text", ".max-w-max.prose.prose-dark")
1510
})
16-
.WriteToJsonFile("data/output.json", dataCleanupOnStart: true)
17-
.TrackVisitedLinksInFile("data/visited.txt", dataCleanupOnStart: true)
18-
.WithTextFileScheduler("data/jobs.txt", "data/currentJob.txt", dataCleanupOnStart: true)
19-
.WithFileConfigStorage("data/config.txt")
20-
.LogToConsole()
21-
.PageCrawlLimit(500)
22-
.HeadlessMode(true)
11+
.WriteToJsonFile("output.json")
12+
.PageCrawlLimit(10)
2313
.WithParallelismDegree(30)
14+
.LogToConsole()
2415
.BuildAsync();
2516

2617
await engine.RunAsync();

README.md

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,21 @@ Parsing single page applications is super simple, just use the *GetWithBrowser*
7676
case Puppeteer will be used to load the pages.
7777

7878
```C#
79+
using WebReaper.Builders;
80+
7981
var engine = await new ScraperEngineBuilder()
80-
.GetWithBrowser("https://www.reddit.com/r/dotnet/")
81-
.Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
82+
.GetWithBrowser("https://www.alexpavlov.dev/blog")
83+
.FollowWithBrowser(".text-gray-900.transition")
8284
.Parse(new()
8385
{
84-
new("title", "._eYtD2XCVieq6emjKBH3m"),
85-
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
86+
new("title", ".text-3xl.font-bold"),
87+
new("text", ".max-w-max.prose.prose-dark")
8688
})
8789
.WriteToJsonFile("output.json")
90+
.PageCrawlLimit(10)
91+
.WithParallelismDegree(30)
8892
.LogToConsole()
89-
.BuildAsync()
93+
.BuildAsync();
9094

9195
await engine.RunAsync();
9296
```
@@ -120,7 +124,7 @@ It can be helpful if the required content is loaded only after some user interac
120124

121125
### Persist the progress locally
122126

123-
If you want to persist the vistited links and job queue locally, so that you can start crawling where you left off you
127+
If you want to persist the visited links and job queue locally, so that you can start crawling where you left off you
124128
can use *ScheduleWithTextFile* and *TrackVisitedLinksInFile* methods:
125129

126130
```C#

WebReaper/Builders/ScraperEngineBuilder.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ public ScraperEngineBuilder WriteToCsvFile(string filePath, bool dataCleanupOnSt
155155
return this;
156156
}
157157

158-
public ScraperEngineBuilder WriteToJsonFile(string filePath, bool dataCleanupOnStart)
158+
public ScraperEngineBuilder WriteToJsonFile(string filePath, bool dataCleanupOnStart = true)
159159
{
160160
SpiderBuilder.WriteToJsonFile(filePath, dataCleanupOnStart);
161161
return this;
@@ -186,6 +186,12 @@ public ScraperEngineBuilder GetWithBrowser(
186186
ConfigBuilder.GetWithBrowser(startUrls, actionBuilder?.Invoke(new PageActionBuilder()));
187187
return this;
188188
}
189+
190+
public ScraperEngineBuilder GetWithBrowser(params string[] startUrls)
191+
{
192+
ConfigBuilder.GetWithBrowser(startUrls);
193+
return this;
194+
}
189195

190196
public ScraperEngineBuilder Follow(string linkSelector)
191197
{

WebReaper/Core/ScraperEngine.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ await Parallel.ForEachAsync(Scheduler.GetAllAsync(cancellationToken), options, a
7676
catch (PageCrawlLimitException ex)
7777
{
7878
Logger.LogWarning(ex, "Shutting down due to page crawl limit {Limit}", ex.PageCrawlLimit);
79-
throw;
8079
}
8180
catch (TaskCanceledException ex)
8281
{

WebReaper/Core/Spider/Concrete/Spider.cs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,7 @@ public async Task<List<Job>> CrawlAsync(Job job, CancellationToken cancellationT
5858

5959
if (config.UrlBlackList.Contains(job.Url)) return Enumerable.Empty<Job>().ToList();
6060

61-
if (await LinkTracker.GetVisitedLinksCount() >= config.PageCrawlLimit)
62-
{
63-
Logger.LogInformation("Page crawl limit has been reached");
64-
65-
throw new PageCrawlLimitException("Page crawl limit has been reached.")
66-
{
67-
PageCrawlLimit = config.PageCrawlLimit
68-
};
69-
}
61+
await CheckCrawlLimit(config);
7062

7163
await LinkTracker.AddVisitedLinkAsync(job.Url);
7264

@@ -80,6 +72,8 @@ public async Task<List<Job>> CrawlAsync(Job job, CancellationToken cancellationT
8072
if (job.PageCategory == PageCategory.TargetPage)
8173
{
8274
await ProcessTargetPage(job, doc, cancellationToken);
75+
76+
await CheckCrawlLimit(config);
8377

8478
return Enumerable.Empty<Job>().ToList();
8579
}
@@ -108,6 +102,19 @@ public async Task<List<Job>> CrawlAsync(Job job, CancellationToken cancellationT
108102
return newJobs;
109103
}
110104

105+
private async Task CheckCrawlLimit(ScraperConfig config)
106+
{
107+
if (await LinkTracker.GetVisitedLinksCount() >= config.PageCrawlLimit)
108+
{
109+
Logger.LogInformation("Page crawl limit has been reached");
110+
111+
throw new PageCrawlLimitException("Page crawl limit has been reached.")
112+
{
113+
PageCrawlLimit = config.PageCrawlLimit
114+
};
115+
}
116+
}
117+
111118
public event Action<ParsedData>? ScrapedData;
112119

113120
public event Func<Metadata, JObject, Task>? PostProcessor;

0 commit comments

Comments (0)