crawler

package
v0.1.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 2, 2026 License: AGPL-3.0 Imports: 24 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type CrawlRequest

type CrawlRequest struct {
	Seeds               []string `json:"seeds"`
	MaxPages            int      `json:"max_pages"`
	MaxDepth            int      `json:"max_depth"`
	Workers             int      `json:"workers"`
	Delay               string   `json:"delay"`
	StoreHTML           bool     `json:"store_html"`
	CrawlScope          string   `json:"crawl_scope"`
	ProjectID           *string  `json:"project_id"`
	CheckExternalLinks  *bool    `json:"check_external_links"`
	ExternalLinkWorkers int      `json:"external_link_workers"`
	RetryStatusCode     int      `json:"retry_status_code"`
	UserAgent           string   `json:"user_agent"`
	CrawlSitemapOnly    bool     `json:"crawl_sitemap_only"`
	CheckPageResources  *bool    `json:"check_page_resources"`
	ResourceWorkers     int      `json:"resource_workers"`
	TLSProfile          string   `json:"tls_profile"`
	JSRenderMode        string   `json:"js_render_mode"`
	JSRenderMaxPages    int      `json:"js_render_max_pages"`
	JSRenderTimeout     string   `json:"js_render_timeout"`
	FollowJSLinks       bool     `json:"follow_js_links"`
	SourceIP            string   `json:"source_ip"`
	ForceIPv4           bool     `json:"force_ipv4"`
}

CrawlRequest holds parameters for starting a new crawl.

type Engine

type Engine struct {
	// contains filtered or unexported fields
}

Engine orchestrates the crawling pipeline.

func NewEngine

func NewEngine(cfg *config.Config, store *storage.Store) *Engine

NewEngine creates a new crawl engine.

func (*Engine) BufferState

func (e *Engine) BufferState() storage.BufferErrorState

BufferState returns the current buffer error state for monitoring.

func (*Engine) PagesCrawled

func (e *Engine) PagesCrawled() int64

PagesCrawled returns the current number of pages crawled.

func (*Engine) PreSeedDedup

func (e *Engine) PreSeedDedup(urls []string)

PreSeedDedup adds URLs to the dedup database without adding them to the queue. Used when resuming a session to avoid re-crawling already visited URLs.

func (*Engine) QueueLen

func (e *Engine) QueueLen() int

QueueLen returns the current frontier queue length.

func (*Engine) ResumeSession

func (e *Engine) ResumeSession(id string, originalSeeds []string)

ResumeSession prepares the engine to resume an existing session.

func (*Engine) Run

func (e *Engine) Run(seeds []string) error

Run starts the crawl with the given seed URLs.

func (*Engine) SessionID

func (e *Engine) SessionID(seeds []string) string

SessionID creates the session and returns its ID without starting the crawl.

func (*Engine) SetSessionID

func (e *Engine) SetSessionID(id string)

SetSessionID sets a pre-existing session ID (for resume).

func (*Engine) Stop

func (e *Engine) Stop()

Stop gracefully stops the engine.

type HostHealth

type HostHealth struct {
	// contains filtered or unexported fields
}

HostHealth tracks success/failure rates per host.

func NewHostHealth

func NewHostHealth() *HostHealth

NewHostHealth creates a new HostHealth tracker.

func (*HostHealth) GlobalErrorRate

func (hh *HostHealth) GlobalErrorRate() float64

GlobalErrorRate returns the global error rate across all hosts.

func (*HostHealth) RecordFailure

func (hh *HostHealth) RecordFailure(host string)

RecordFailure records a failed fetch for a host.

func (*HostHealth) RecordSuccess

func (hh *HostHealth) RecordSuccess(host string)

RecordSuccess records a successful fetch for a host.

func (*HostHealth) ShouldRetry

func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool

ShouldRetry returns false if the host has exceeded the consecutive failure threshold.

type Manager

type Manager struct {
	// contains filtered or unexported fields
}

Manager manages running crawl engines.

func NewManager

func NewManager(cfg *config.Config, store *storage.Store) *Manager

NewManager creates a new crawl manager.

func (*Manager) ActiveSessions

func (m *Manager) ActiveSessions() []string

ActiveSessions returns IDs of currently running sessions.

func (*Manager) BufferState

func (m *Manager) BufferState(sessionID string) storage.BufferErrorState

BufferState returns the buffer error state for a running session.

func (*Manager) IsQueued

func (m *Manager) IsQueued(sessionID string) bool

IsQueued returns true if the session is waiting in the queue.

func (*Manager) IsRunning

func (m *Manager) IsRunning(sessionID string) bool

IsRunning checks if a session is currently running.

func (*Manager) LastError

func (m *Manager) LastError(sessionID string) string

LastError returns the error message from the last run of a session, if any.

func (*Manager) Progress

func (m *Manager) Progress(sessionID string) (int64, int, bool)

Progress returns current crawl progress for a running session.

func (*Manager) QueuedSessions

func (m *Manager) QueuedSessions() []string

QueuedSessions returns the IDs of sessions waiting in the queue.

func (*Manager) RecoverOrphanedSessions

func (m *Manager) RecoverOrphanedSessions(ctx context.Context)

RecoverOrphanedSessions marks any sessions still in "running" status as "crashed". Should be called at startup to clean up after a previous unclean shutdown.

func (*Manager) ResumeCrawl

func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)

ResumeCrawl resumes a stopped/completed session by crawling links that were discovered but not yet visited. If overrides is non-nil, its non-zero fields override the default config.

func (*Manager) RetryFailed

func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)

RetryFailed retries pages with status_code = 0 (fetch errors) or a specific status code. Deletes the failed rows, then runs a mini-crawl with those URLs.

func (*Manager) Shutdown

func (m *Manager) Shutdown(timeout time.Duration)

Shutdown gracefully stops all running engines within the given timeout. Engines still running after the timeout are marked as "crashed". Queued sessions are marked as "stopped".

func (*Manager) StartCrawl

func (m *Manager) StartCrawl(req CrawlRequest) (string, error)

StartCrawl launches a new crawl session in background. Returns the session ID. If all semaphore slots are taken, the crawl is queued and starts automatically when a slot becomes available.

func (*Manager) StopCrawl

func (m *Manager) StopCrawl(sessionID string) error

StopCrawl stops a running crawl session or removes it from the queue.

type RetryItem

type RetryItem struct {
	URL      string
	Host     string
	Depth    int
	FoundOn  string
	Attempt  int
	ReadyAt  time.Time
	LastCode int
	LastErr  string
	// contains filtered or unexported fields
}

RetryItem represents a URL waiting to be retried.

type RetryPolicy

type RetryPolicy struct {
	MaxRetries int
	BaseDelay  time.Duration
	MaxDelay   time.Duration
}

RetryPolicy decides whether a failed request should be retried and computes delays.

func (*RetryPolicy) ComputeDelay

func (p *RetryPolicy) ComputeDelay(attempt int, retryAfterHeader string) time.Duration

ComputeDelay calculates the delay before the next retry attempt. If a Retry-After header is present, it takes priority.

func (*RetryPolicy) ShouldRetry

func (p *RetryPolicy) ShouldRetry(statusCode int, errString string, attempt int) bool

ShouldRetry returns true if the request should be retried based on status code, error string, and current attempt number.

type RetryQueue

type RetryQueue struct {
	// contains filtered or unexported fields
}

RetryQueue is a thread-safe min-heap of RetryItems ordered by ReadyAt.

func NewRetryQueue

func NewRetryQueue() *RetryQueue

NewRetryQueue creates a new empty RetryQueue.

func (*RetryQueue) Len

func (rq *RetryQueue) Len() int

Len returns the number of items in the queue.

func (*RetryQueue) PopReady

func (rq *RetryQueue) PopReady() *RetryItem

PopReady returns the next item whose ReadyAt is in the past, or nil if none are ready.

func (*RetryQueue) Push

func (rq *RetryQueue) Push(item *RetryItem)

Push adds an item to the retry queue.

type Session

type Session struct {
	ID        string
	StartedAt time.Time
	SeedURLs  []string
	Config    *config.Config
	Status    string
	Pages     uint64
	ProjectID *string
}

Session represents a single crawl session lifecycle.

func NewSession

func NewSession(seeds []string, cfg *config.Config) *Session

NewSession creates a new crawl session.

func (*Session) ToStorageRow

func (s *Session) ToStorageRow() *storage.CrawlSession

ToStorageRow converts a Session to a storage model.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL