Documentation
¶
Index ¶
- type CrawlRequest
- type Engine
- func (e *Engine) BufferState() storage.BufferErrorState
- func (e *Engine) PagesCrawled() int64
- func (e *Engine) PreSeedDedup(urls []string)
- func (e *Engine) QueueLen() int
- func (e *Engine) ResumeSession(id string, originalSeeds []string)
- func (e *Engine) Run(seeds []string) error
- func (e *Engine) SessionID(seeds []string) string
- func (e *Engine) SetSessionID(id string)
- func (e *Engine) Stop()
- type HostHealth
- type Manager
- func (m *Manager) ActiveSessions() []string
- func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
- func (m *Manager) IsQueued(sessionID string) bool
- func (m *Manager) IsRunning(sessionID string) bool
- func (m *Manager) LastError(sessionID string) string
- func (m *Manager) Progress(sessionID string) (int64, int, bool)
- func (m *Manager) QueuedSessions() []string
- func (m *Manager) RecoverOrphanedSessions(ctx context.Context)
- func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
- func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
- func (m *Manager) Shutdown(timeout time.Duration)
- func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
- func (m *Manager) StopCrawl(sessionID string) error
- type RetryItem
- type RetryPolicy
- type RetryQueue
- type Session
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CrawlRequest ¶
type CrawlRequest struct {
Seeds []string `json:"seeds"`
MaxPages int `json:"max_pages"`
MaxDepth int `json:"max_depth"`
Workers int `json:"workers"`
Delay string `json:"delay"`
StoreHTML bool `json:"store_html"`
CrawlScope string `json:"crawl_scope"`
ProjectID *string `json:"project_id"`
CheckExternalLinks *bool `json:"check_external_links"`
ExternalLinkWorkers int `json:"external_link_workers"`
RetryStatusCode int `json:"retry_status_code"`
UserAgent string `json:"user_agent"`
CrawlSitemapOnly bool `json:"crawl_sitemap_only"`
CheckPageResources *bool `json:"check_page_resources"`
ResourceWorkers int `json:"resource_workers"`
TLSProfile string `json:"tls_profile"`
JSRenderMode string `json:"js_render_mode"`
JSRenderMaxPages int `json:"js_render_max_pages"`
JSRenderTimeout string `json:"js_render_timeout"`
FollowJSLinks bool `json:"follow_js_links"`
SourceIP string `json:"source_ip"`
ForceIPv4 bool `json:"force_ipv4"`
}
CrawlRequest holds parameters for starting a new crawl.
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine orchestrates the crawling pipeline.
func (*Engine) BufferState ¶
func (e *Engine) BufferState() storage.BufferErrorState
BufferState returns the current buffer error state for monitoring.
func (*Engine) PagesCrawled ¶
PagesCrawled returns the current number of pages crawled.
func (*Engine) PreSeedDedup ¶
PreSeedDedup adds URLs to the dedup database without adding them to the queue. Used when resuming a session to avoid re-crawling already visited URLs.
func (*Engine) ResumeSession ¶
ResumeSession prepares the engine to resume an existing session.
func (*Engine) SessionID ¶
SessionID creates the session and returns its ID without starting the crawl.
func (*Engine) SetSessionID ¶
SetSessionID sets a pre-existing session ID (for resume).
type HostHealth ¶
type HostHealth struct {
// contains filtered or unexported fields
}
HostHealth tracks success/failure rates per host.
func NewHostHealth ¶
func NewHostHealth() *HostHealth
NewHostHealth creates a new HostHealth tracker.
func (*HostHealth) GlobalErrorRate ¶
func (hh *HostHealth) GlobalErrorRate() float64
GlobalErrorRate returns the global error rate across all hosts.
func (*HostHealth) RecordFailure ¶
func (hh *HostHealth) RecordFailure(host string)
RecordFailure records a failed fetch for a host.
func (*HostHealth) RecordSuccess ¶
func (hh *HostHealth) RecordSuccess(host string)
RecordSuccess records a successful fetch for a host.
func (*HostHealth) ShouldRetry ¶
func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool
ShouldRetry returns false if the host has exceeded the consecutive failure threshold.
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager manages running crawl engines.
func NewManager ¶
NewManager creates a new crawl manager.
func (*Manager) ActiveSessions ¶
ActiveSessions returns IDs of currently running sessions.
func (*Manager) BufferState ¶
func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
BufferState returns the buffer error state for a running session.
func (*Manager) LastError ¶
LastError returns the error message from the last run of a session, if any.
func (*Manager) QueuedSessions ¶
QueuedSessions returns the IDs of sessions waiting in the queue.
func (*Manager) RecoverOrphanedSessions ¶
RecoverOrphanedSessions marks any sessions still in "running" status as "crashed". Should be called at startup to clean up after a previous unclean shutdown.
func (*Manager) ResumeCrawl ¶
func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
ResumeCrawl resumes a stopped/completed session by crawling links that were discovered but not yet crawled. If overrides is non-nil, its non-zero fields override the default config.
func (*Manager) RetryFailed ¶
func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
RetryFailed retries pages that failed with status_code = 0 (fetch errors) or with the given specific status code. It deletes the failed rows, then runs a mini-crawl over those URLs.
func (*Manager) Shutdown ¶
Shutdown gracefully stops all running engines within the given timeout. Engines still running after the timeout are marked as "crashed". Queued sessions are marked as "stopped".
func (*Manager) StartCrawl ¶
func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
StartCrawl launches a new crawl session in background. Returns the session ID. If all semaphore slots are taken, the crawl is queued and starts automatically when a slot becomes available.
type RetryItem ¶
type RetryItem struct {
URL string
Host string
Depth int
FoundOn string
Attempt int
ReadyAt time.Time
LastCode int
LastErr string
// contains filtered or unexported fields
}
RetryItem represents a URL waiting to be retried.
type RetryPolicy ¶
RetryPolicy decides whether a failed request should be retried and computes delays.
func (*RetryPolicy) ComputeDelay ¶
func (p *RetryPolicy) ComputeDelay(attempt int, retryAfterHeader string) time.Duration
ComputeDelay calculates the delay before the next retry attempt. If a Retry-After header is present, it takes priority.
func (*RetryPolicy) ShouldRetry ¶
func (p *RetryPolicy) ShouldRetry(statusCode int, errString string, attempt int) bool
ShouldRetry returns true if the request should be retried based on status code, error string, and current attempt number.
type RetryQueue ¶
type RetryQueue struct {
// contains filtered or unexported fields
}
RetryQueue is a thread-safe min-heap of RetryItems ordered by ReadyAt.
func (*RetryQueue) Len ¶
func (rq *RetryQueue) Len() int
Len returns the number of items in the queue.
func (*RetryQueue) PopReady ¶
func (rq *RetryQueue) PopReady() *RetryItem
PopReady returns the next item whose ReadyAt is in the past, or nil if none are ready.
func (*RetryQueue) Push ¶
func (rq *RetryQueue) Push(item *RetryItem)
Push adds an item to the retry queue.
type Session ¶
type Session struct {
ID string
StartedAt time.Time
SeedURLs []string
Config *config.Config
Status string
Pages uint64
ProjectID *string
}
Session represents a single crawl session lifecycle.
func NewSession ¶
NewSession creates a new crawl session.
func (*Session) ToStorageRow ¶
func (s *Session) ToStorageRow() *storage.CrawlSession
ToStorageRow converts a Session to a storage model.