scrape

package
v1.2.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 1, 2025 License: MIT Imports: 26 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Predefined ScrapeError values. Each entry is a pointer to a ScrapeError
// carrying a short machine-readable Type and a human-readable Message.
// NOTE(review): these look like sentinel values for callers to match on;
// confirm whether matching is by pointer identity or by the Type string.
var (
	// Request-level errors (robots policy, timeouts, redirects, retries,
	// rate limiting, circuit breaker, URL validation, response size)
	ErrRobotsDenied     = &ScrapeError{Type: "robots_denied", Message: "robots.txt disallows this path"}
	ErrTimeout          = &ScrapeError{Type: "timeout", Message: "request timeout"}
	ErrTooManyRedirects = &ScrapeError{Type: "too_many_redirects", Message: "exceeded maximum redirect limit"}
	ErrRetryExhausted   = &ScrapeError{Type: "retry_exhausted", Message: "maximum retry attempts exceeded"}
	ErrRateLimited      = &ScrapeError{Type: "rate_limited", Message: "rate limit exceeded"}
	ErrCircuitOpen      = &ScrapeError{Type: "circuit_open", Message: "circuit breaker is open"}
	ErrInvalidURL       = &ScrapeError{Type: "invalid_url", Message: "invalid URL format"}
	ErrContentTooLarge  = &ScrapeError{Type: "content_too_large", Message: "response content exceeds size limit"}

	// Parse-specific errors
	// NOTE(review): the "Base" suffix on the last two suggests they are
	// templates to be specialised with field/schema detail — confirm.
	ErrNoQuoteSummary   = &ScrapeError{Type: "no_quote_summary", Message: "could not locate quoteSummary script payload"}
	ErrJSONUnescape     = &ScrapeError{Type: "json_unescape", Message: "failed to unescape JSON from envelope body"}
	ErrJSONDecode       = &ScrapeError{Type: "json_decode", Message: "failed to decode JSON structure"}
	ErrMissingFieldBase = &ScrapeError{Type: "missing_field", Message: "required field is missing"}
	ErrSchemaDriftBase  = &ScrapeError{Type: "schema_drift", Message: "unexpected schema change detected"}

	// News-specific errors
	ErrNewsNoArticles = &ScrapeError{Type: "news_no_articles", Message: "no news articles found"}
	ErrNewsParse      = &ScrapeError{Type: "news_parse", Message: "failed to parse news HTML"}
)

Predefined error values

Functions

func IsRetryableError

func IsRetryableError(err error) bool

IsRetryableError determines if an error should trigger a retry

func IsValidRobotsPolicy

func IsValidRobotsPolicy(policy string) bool

IsValidRobotsPolicy checks if a robots policy is valid

func LoadAnalysisRegexConfig

func LoadAnalysisRegexConfig() error

LoadAnalysisRegexConfig loads the regex patterns from a YAML file

func LoadAnalystInsightsRegexConfig

func LoadAnalystInsightsRegexConfig() error

LoadAnalystInsightsRegexConfig loads the regex patterns from a YAML file

func LoadFinancialsRegexConfig

func LoadFinancialsRegexConfig() error

LoadFinancialsRegexConfig loads the regex patterns from a YAML file

func LoadNewsRegexConfig

func LoadNewsRegexConfig() error

LoadNewsRegexConfig loads the news regex patterns from a YAML file

func LoadRegexConfig

func LoadRegexConfig() error

LoadRegexConfig loads the regex patterns from a YAML file

func NewClient

func NewClient(config *Config, httpxPool *httpx.Client) *client

NewClient creates a new scraping client

func ParseNews

func ParseNews(html []byte, baseURL string, now time.Time) ([]NewsItem, *NewsStats, error)

ParseNews extracts news articles from HTML with robust error handling and deduplication

func ParseYahooDate

func ParseYahooDate(ts any) (time.Time, bool)

ParseYahooDate parses various Yahoo date formats

func ParseYahooPeriod

func ParseYahooPeriod(periodStr string) (time.Time, time.Time, bool)

ParseYahooPeriod parses Yahoo's period format (e.g., "2023-12-31")

func StringToFloat64

func StringToFloat64(s string) (float64, bool)

StringToFloat64 safely converts a string to float64

func StringToInt64

func StringToInt64(s string) (int64, bool)

StringToInt64 safely converts a string to int64

Types

type AnalysisDTO

// AnalysisDTO represents extracted analysis data for a single symbol.
// Every field is always emitted in JSON (no omitempty tags).
type AnalysisDTO struct {
	// Symbol and Market identify the instrument the data belongs to.
	Symbol       string           `json:"symbol"`
	Market       string           `json:"market"`
	// Currency uses the package's Currency type (declared elsewhere).
	Currency     Currency         `json:"currency"`
	// RecTrends and EPSQuarterly hold slices of the package's Recommendation
	// and QuarterlyEPS types; a nil slice encodes as JSON null.
	RecTrends    []Recommendation `json:"rec_trends"`
	EPSQuarterly []QuarterlyEPS   `json:"eps_quarterly"`
	// NOTE(review): presumably the time the data was scraped or is valid for — confirm.
	AsOf         time.Time        `json:"as_of"`
}

AnalysisDTO represents extracted analysis data

type AnalysisRegexConfig

// AnalysisRegexConfig holds the regex patterns for analysis extraction.
// It is populated from YAML (see the yaml tags); each nested struct groups
// the pattern strings for one section of the analysis data.
// NOTE(review): the strings are presumably compiled with package regexp by
// the parser — confirm where and whether compile errors are surfaced.
type AnalysisRegexConfig struct {
	// Patterns for the earnings-estimate section.
	EarningsEstimate struct {
		SectionPattern  string `yaml:"section_pattern"`
		CurrencyPattern string `yaml:"currency_pattern"`
		TableRowPattern string `yaml:"table_row_pattern"`
	} `yaml:"earnings_estimate"`

	// Patterns for the revenue-estimate section.
	RevenueEstimate struct {
		SectionPattern  string `yaml:"section_pattern"`
		CurrencyPattern string `yaml:"currency_pattern"`
		TableRowPattern string `yaml:"table_row_pattern"`
	} `yaml:"revenue_estimate"`

	// Patterns for the earnings-history section. This is the only section
	// with separate header and table-cell patterns.
	EarningsHistory struct {
		SectionPattern   string `yaml:"section_pattern"`
		CurrencyPattern  string `yaml:"currency_pattern"`
		HeaderPattern    string `yaml:"header_pattern"`
		TableRowPattern  string `yaml:"table_row_pattern"`
		TableCellPattern string `yaml:"table_cell_pattern"`
	} `yaml:"earnings_history"`

	// Patterns for the EPS-trend section.
	EPSTrend struct {
		SectionPattern  string `yaml:"section_pattern"`
		CurrencyPattern string `yaml:"currency_pattern"`
		TableRowPattern string `yaml:"table_row_pattern"`
	} `yaml:"eps_trend"`

	// Patterns for the EPS-revisions section.
	EPSRevisions struct {
		SectionPattern  string `yaml:"section_pattern"`
		CurrencyPattern string `yaml:"currency_pattern"`
		TableRowPattern string `yaml:"table_row_pattern"`
	} `yaml:"eps_revisions"`

	// Patterns for the growth-estimate section; unlike the other sections
	// it has no currency pattern.
	GrowthEstimate struct {
		SectionPattern  string `yaml:"section_pattern"`
		TableRowPattern string `yaml:"table_row_pattern"`
	} `yaml:"growth_estimate"`
}

AnalysisRegexConfig holds the regex patterns for analysis extraction

type AnalystInsightsDTO

// AnalystInsightsDTO represents analyst insights data from Yahoo Finance.
// Symbol, Market and AsOf are always emitted; every other field is a pointer
// tagged omitempty, so a nil pointer is left out of the JSON output.
type AnalystInsightsDTO struct {
	Symbol string    `json:"symbol"`
	Market string    `json:"market"`
	AsOf   time.Time `json:"as_of"`

	// Price Targets
	// NOTE(review): currency/units of these prices are not recorded on this
	// struct — confirm how callers determine them.
	CurrentPrice      *float64 `json:"current_price,omitempty"`
	TargetMeanPrice   *float64 `json:"target_mean_price,omitempty"`
	TargetMedianPrice *float64 `json:"target_median_price,omitempty"`
	TargetHighPrice   *float64 `json:"target_high_price,omitempty"`
	TargetLowPrice    *float64 `json:"target_low_price,omitempty"`

	// Analyst Opinions
	// NOTE(review): the scale of RecommendationMean and the set of values
	// RecommendationKey can take are not visible here — confirm.
	NumberOfAnalysts   *int     `json:"number_of_analysts,omitempty"`
	RecommendationMean *float64 `json:"recommendation_mean,omitempty"`
	RecommendationKey  *string  `json:"recommendation_key,omitempty"`
}

AnalystInsightsDTO represents analyst insights data from Yahoo Finance

func ParseAnalystInsights

func ParseAnalystInsights(html []byte, symbol, market string) (*AnalystInsightsDTO, error)

ParseAnalystInsights parses analyst insights data from Yahoo Finance HTML

type AnalystInsightsRegexConfig

// AnalystInsightsRegexConfig holds the regex patterns for analyst insights
// extraction. It is populated from YAML (see the yaml tags).
type AnalystInsightsRegexConfig struct {
	// FinancialData holds a single combined pattern.
	// NOTE(review): presumably tried before the per-field patterns below —
	// confirm the order in the parser.
	FinancialData struct {
		CombinedPattern string `yaml:"combined_pattern"`
	} `yaml:"financial_data"`

	// IndividualFields holds one pattern per field; the names match the
	// pointer fields of AnalystInsightsDTO.
	IndividualFields struct {
		CurrentPrice       string `yaml:"current_price"`
		TargetMeanPrice    string `yaml:"target_mean_price"`
		TargetMedianPrice  string `yaml:"target_median_price"`
		TargetHighPrice    string `yaml:"target_high_price"`
		TargetLowPrice     string `yaml:"target_low_price"`
		RecommendationMean string `yaml:"recommendation_mean"`
		RecommendationKey  string `yaml:"recommendation_key"`
		NumberOfAnalysts   string `yaml:"number_of_analysts"`
	} `yaml:"individual_fields"`
}

AnalystInsightsRegexConfig holds the regex patterns for analyst insights extraction

type BackoffPolicy

// BackoffPolicy carries the parameters for exponential backoff with jitter:
// two durations followed by two float64 tuning factors.
// NOTE(review): presumably delays grow from BaseDelay by Multiplier per
// attempt, are capped at MaxDelay and randomised by JitterFactor — confirm
// against CalculateDelay.
type BackoffPolicy struct {
	BaseDelay, MaxDelay      time.Duration
	Multiplier, JitterFactor float64
}

BackoffPolicy implements exponential backoff with jitter

func DefaultBackoffPolicy

func DefaultBackoffPolicy() *BackoffPolicy

DefaultBackoffPolicy returns a sensible default backoff policy

func NewBackoffPolicy

func NewBackoffPolicy(baseDelay, maxDelay time.Duration, multiplier, jitterFactor float64) *BackoffPolicy

NewBackoffPolicy creates a new backoff policy with custom parameters

func (*BackoffPolicy) CalculateDelay

func (bp *BackoffPolicy) CalculateDelay(attempt int) time.Duration

CalculateDelay calculates the backoff delay for a given attempt

func (*BackoffPolicy) CalculateDelayWithRetryAfter

func (bp *BackoffPolicy) CalculateDelayWithRetryAfter(attempt int, retryAfter time.Duration) time.Duration

CalculateDelayWithRetryAfter calculates backoff delay considering Retry-After header

func (*BackoffPolicy) CalculateDelays

func (bp *BackoffPolicy) CalculateDelays(maxAttempts int) []time.Duration

CalculateDelays calculates delays for multiple attempts (useful for testing)

func (*BackoffPolicy) GetStats

func (bp *BackoffPolicy) GetStats() map[string]interface{}

GetStats returns statistics about the backoff policy

func (*BackoffPolicy) Validate

func (bp *BackoffPolicy) Validate() error

Validate validates the backoff policy parameters

type BackoffPolicyConfig

// BackoffPolicyConfig represents backoff configuration. Its fields have the
// same names, types and order as those of BackoffPolicy.
type BackoffPolicyConfig struct {
	BaseDelay, MaxDelay      time.Duration
	Multiplier, JitterFactor float64
}

BackoffPolicyConfig represents backoff configuration

func DefaultBackoffPolicyConfig

func DefaultBackoffPolicyConfig() *BackoffPolicyConfig

DefaultBackoffPolicyConfig returns a sensible default backoff policy configuration

type Client

// Client is the interface for web scraping operations.
type Client interface {
	// Fetch takes a context and a URL string and returns a byte slice, a
	// *FetchMeta and an error.
	// NOTE(review): presumably the bytes are the fetched body and FetchMeta
	// describes the request/response — confirm against the implementation.
	Fetch(ctx context.Context, url string) ([]byte, *FetchMeta, error)
}

Client is the interface for web scraping operations

type ColumnPatterns

// ColumnPatterns holds one string per valuation column, loaded from YAML
// under the keys given in the yaml tags. The field names mirror the fields
// of ComprehensiveKeyStatisticsDTO.Current.
// NOTE(review): presumably each string is a regex used to locate that column
// when parsing key statistics — confirm against the loader and parser.
type ColumnPatterns struct {
	MarketCap              string `yaml:"market_cap"`
	EnterpriseValue        string `yaml:"enterprise_value"`
	TrailingPE             string `yaml:"trailing_pe"`
	ForwardPE              string `yaml:"forward_pe"`
	PEGRatio               string `yaml:"peg_ratio"`
	PriceSales             string `yaml:"price_sales"`
	PriceBook              string `yaml:"price_book"`
	EnterpriseValueRevenue string `yaml:"enterprise_value_revenue"`
	EnterpriseValueEBITDA  string `yaml:"enterprise_value_ebitda"`
}

type ComprehensiveAnalysisDTO

// ComprehensiveAnalysisDTO represents comprehensive analysis data from Yahoo
// Finance for one symbol.
//
// Most sections repeat the same anonymous sub-struct for four periods:
// CurrentQtr, NextQtr, CurrentYear and NextYear. Value fields are pointers
// tagged omitempty, so a nil pointer is left out of the JSON output; the
// section structs themselves are always emitted.
type ComprehensiveAnalysisDTO struct {
	Symbol string    `json:"symbol"`
	Market string    `json:"market"`
	AsOf   time.Time `json:"as_of"`

	// Earnings Estimate
	// Numeric EPS estimates per period, plus the section's currency string.
	EarningsEstimate struct {
		Currency   string `json:"currency"`
		CurrentQtr struct {
			NoOfAnalysts *int     `json:"no_of_analysts,omitempty"`
			AvgEstimate  *float64 `json:"avg_estimate,omitempty"`
			LowEstimate  *float64 `json:"low_estimate,omitempty"`
			HighEstimate *float64 `json:"high_estimate,omitempty"`
			YearAgoEPS   *float64 `json:"year_ago_eps,omitempty"`
		} `json:"current_qtr"`
		NextQtr struct {
			NoOfAnalysts *int     `json:"no_of_analysts,omitempty"`
			AvgEstimate  *float64 `json:"avg_estimate,omitempty"`
			LowEstimate  *float64 `json:"low_estimate,omitempty"`
			HighEstimate *float64 `json:"high_estimate,omitempty"`
			YearAgoEPS   *float64 `json:"year_ago_eps,omitempty"`
		} `json:"next_qtr"`
		CurrentYear struct {
			NoOfAnalysts *int     `json:"no_of_analysts,omitempty"`
			AvgEstimate  *float64 `json:"avg_estimate,omitempty"`
			LowEstimate  *float64 `json:"low_estimate,omitempty"`
			HighEstimate *float64 `json:"high_estimate,omitempty"`
			YearAgoEPS   *float64 `json:"year_ago_eps,omitempty"`
		} `json:"current_year"`
		NextYear struct {
			NoOfAnalysts *int     `json:"no_of_analysts,omitempty"`
			AvgEstimate  *float64 `json:"avg_estimate,omitempty"`
			LowEstimate  *float64 `json:"low_estimate,omitempty"`
			HighEstimate *float64 `json:"high_estimate,omitempty"`
			YearAgoEPS   *float64 `json:"year_ago_eps,omitempty"`
		} `json:"next_year"`
	} `json:"earnings_estimate"`

	// Revenue Estimate
	// Unlike the earnings estimates, the monetary values here are strings,
	// not floats (see the note on AvgEstimate below).
	RevenueEstimate struct {
		Currency   string `json:"currency"`
		CurrentQtr struct {
			NoOfAnalysts       *int    `json:"no_of_analysts,omitempty"`
			AvgEstimate        *string `json:"avg_estimate,omitempty"` // Keep as string due to "B" suffix
			LowEstimate        *string `json:"low_estimate,omitempty"`
			HighEstimate       *string `json:"high_estimate,omitempty"`
			YearAgoSales       *string `json:"year_ago_sales,omitempty"`
			SalesGrowthYearEst *string `json:"sales_growth_year_est,omitempty"`
		} `json:"current_qtr"`
		NextQtr struct {
			NoOfAnalysts       *int    `json:"no_of_analysts,omitempty"`
			AvgEstimate        *string `json:"avg_estimate,omitempty"`
			LowEstimate        *string `json:"low_estimate,omitempty"`
			HighEstimate       *string `json:"high_estimate,omitempty"`
			YearAgoSales       *string `json:"year_ago_sales,omitempty"`
			SalesGrowthYearEst *string `json:"sales_growth_year_est,omitempty"`
		} `json:"next_qtr"`
		CurrentYear struct {
			NoOfAnalysts       *int    `json:"no_of_analysts,omitempty"`
			AvgEstimate        *string `json:"avg_estimate,omitempty"`
			LowEstimate        *string `json:"low_estimate,omitempty"`
			HighEstimate       *string `json:"high_estimate,omitempty"`
			YearAgoSales       *string `json:"year_ago_sales,omitempty"`
			SalesGrowthYearEst *string `json:"sales_growth_year_est,omitempty"`
		} `json:"current_year"`
		NextYear struct {
			NoOfAnalysts       *int    `json:"no_of_analysts,omitempty"`
			AvgEstimate        *string `json:"avg_estimate,omitempty"`
			LowEstimate        *string `json:"low_estimate,omitempty"`
			HighEstimate       *string `json:"high_estimate,omitempty"`
			YearAgoSales       *string `json:"year_ago_sales,omitempty"`
			SalesGrowthYearEst *string `json:"sales_growth_year_est,omitempty"`
		} `json:"next_year"`
	} `json:"revenue_estimate"`

	// Earnings History (dynamic dates)
	// Data is a slice with one entry per reported date rather than fixed
	// period fields. Date and SurprisePercent are kept as strings.
	// NOTE(review): the Date string format is not visible here — confirm.
	EarningsHistory struct {
		Currency string `json:"currency"`
		Data     []struct {
			Date            string   `json:"date"`
			EPSEst          *float64 `json:"eps_est,omitempty"`
			EPSActual       *float64 `json:"eps_actual,omitempty"`
			Difference      *float64 `json:"difference,omitempty"`
			SurprisePercent *string  `json:"surprise_percent,omitempty"`
		} `json:"data"`
	} `json:"earnings_history"`

	// EPS Trend
	// Current estimate per period alongside the values from 7, 30, 60 and
	// 90 days ago.
	EPSTrend struct {
		Currency   string `json:"currency"`
		CurrentQtr struct {
			CurrentEstimate *float64 `json:"current_estimate,omitempty"`
			Days7Ago        *float64 `json:"days_7_ago,omitempty"`
			Days30Ago       *float64 `json:"days_30_ago,omitempty"`
			Days60Ago       *float64 `json:"days_60_ago,omitempty"`
			Days90Ago       *float64 `json:"days_90_ago,omitempty"`
		} `json:"current_qtr"`
		NextQtr struct {
			CurrentEstimate *float64 `json:"current_estimate,omitempty"`
			Days7Ago        *float64 `json:"days_7_ago,omitempty"`
			Days30Ago       *float64 `json:"days_30_ago,omitempty"`
			Days60Ago       *float64 `json:"days_60_ago,omitempty"`
			Days90Ago       *float64 `json:"days_90_ago,omitempty"`
		} `json:"next_qtr"`
		CurrentYear struct {
			CurrentEstimate *float64 `json:"current_estimate,omitempty"`
			Days7Ago        *float64 `json:"days_7_ago,omitempty"`
			Days30Ago       *float64 `json:"days_30_ago,omitempty"`
			Days60Ago       *float64 `json:"days_60_ago,omitempty"`
			Days90Ago       *float64 `json:"days_90_ago,omitempty"`
		} `json:"current_year"`
		NextYear struct {
			CurrentEstimate *float64 `json:"current_estimate,omitempty"`
			Days7Ago        *float64 `json:"days_7_ago,omitempty"`
			Days30Ago       *float64 `json:"days_30_ago,omitempty"`
			Days60Ago       *float64 `json:"days_60_ago,omitempty"`
			Days90Ago       *float64 `json:"days_90_ago,omitempty"`
		} `json:"next_year"`
	} `json:"eps_trend"`

	// EPS Revisions
	// Integer counts of upward and downward revisions over the last 7 and
	// 30 days, per period.
	EPSRevisions struct {
		Currency   string `json:"currency"`
		CurrentQtr struct {
			UpLast7Days    *int `json:"up_last_7_days,omitempty"`
			UpLast30Days   *int `json:"up_last_30_days,omitempty"`
			DownLast7Days  *int `json:"down_last_7_days,omitempty"`
			DownLast30Days *int `json:"down_last_30_days,omitempty"`
		} `json:"current_qtr"`
		NextQtr struct {
			UpLast7Days    *int `json:"up_last_7_days,omitempty"`
			UpLast30Days   *int `json:"up_last_30_days,omitempty"`
			DownLast7Days  *int `json:"down_last_7_days,omitempty"`
			DownLast30Days *int `json:"down_last_30_days,omitempty"`
		} `json:"next_qtr"`
		CurrentYear struct {
			UpLast7Days    *int `json:"up_last_7_days,omitempty"`
			UpLast30Days   *int `json:"up_last_30_days,omitempty"`
			DownLast7Days  *int `json:"down_last_7_days,omitempty"`
			DownLast30Days *int `json:"down_last_30_days,omitempty"`
		} `json:"current_year"`
		NextYear struct {
			UpLast7Days    *int `json:"up_last_7_days,omitempty"`
			UpLast30Days   *int `json:"up_last_30_days,omitempty"`
			DownLast7Days  *int `json:"down_last_7_days,omitempty"`
			DownLast30Days *int `json:"down_last_30_days,omitempty"`
		} `json:"next_year"`
	} `json:"eps_revisions"`

	// Growth Estimates (only ticker data, not S&P 500)
	// One string per period; this section has no currency field.
	// NOTE(review): values are presumably percentage strings — confirm.
	GrowthEstimate struct {
		CurrentQtr  *string `json:"current_qtr,omitempty"`
		NextQtr     *string `json:"next_qtr,omitempty"`
		CurrentYear *string `json:"current_year,omitempty"`
		NextYear    *string `json:"next_year,omitempty"`
	} `json:"growth_estimate"`
}

ComprehensiveAnalysisDTO represents comprehensive analysis data from Yahoo Finance

func ParseAnalysis

func ParseAnalysis(html []byte, symbol, market string) (*ComprehensiveAnalysisDTO, error)

ParseAnalysis parses analysis data from Yahoo Finance HTML

type ComprehensiveFinancialsDTO

// ComprehensiveFinancialsDTO holds all financials data including historical.
//
// Monetary and ratio values use the package's Scaled type (declared
// elsewhere); share counts are plain *int64. All value fields are pointers
// tagged omitempty, so a nil pointer is left out of the JSON output.
type ComprehensiveFinancialsDTO struct {
	Symbol   string    `json:"symbol"`
	Market   string    `json:"market"`
	Currency string    `json:"currency"`
	AsOf     time.Time `json:"as_of"`

	// Current values (most recent quarter)
	// Covers income-statement, balance-sheet and cash-flow fields.
	Current struct {
		TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
		CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
		GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
		OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
		OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
		NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
		OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
		PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
		TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
		NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
		BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
		DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
		BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
		DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
		TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
		NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
		EBIT                                 *Scaled `json:"ebit,omitempty"`
		EBITDA                               *Scaled `json:"ebitda,omitempty"`
		ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
		ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
		NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`

		// Balance Sheet fields
		TotalAssets             *Scaled `json:"total_assets,omitempty"`
		TotalCapitalization     *Scaled `json:"total_capitalization,omitempty"`
		CommonStockEquity       *Scaled `json:"common_stock_equity,omitempty"`
		CapitalLeaseObligations *Scaled `json:"capital_lease_obligations,omitempty"`
		NetTangibleAssets       *Scaled `json:"net_tangible_assets,omitempty"`
		WorkingCapital          *Scaled `json:"working_capital,omitempty"`
		InvestedCapital         *Scaled `json:"invested_capital,omitempty"`
		TangibleBookValue       *Scaled `json:"tangible_book_value,omitempty"`
		TotalDebt               *Scaled `json:"total_debt,omitempty"`
		ShareIssued             *int64  `json:"share_issued,omitempty"`

		// Cash Flow fields
		OperatingCashFlow        *Scaled `json:"operating_cash_flow,omitempty"`
		InvestingCashFlow        *Scaled `json:"investing_cash_flow,omitempty"`
		FinancingCashFlow        *Scaled `json:"financing_cash_flow,omitempty"`
		EndCashPosition          *Scaled `json:"end_cash_position,omitempty"`
		CapitalExpenditure       *Scaled `json:"capital_expenditure,omitempty"`
		IssuanceOfDebt           *Scaled `json:"issuance_of_debt,omitempty"`
		RepaymentOfDebt          *Scaled `json:"repayment_of_debt,omitempty"`
		RepurchaseOfCapitalStock *Scaled `json:"repurchase_of_capital_stock,omitempty"`
		FreeCashFlow             *Scaled `json:"free_cash_flow,omitempty"`
	} `json:"current"`

	// Historical values
	// Five fixed quarters, each with a Date string and the income-statement
	// fields only (no balance-sheet or cash-flow fields).
	// NOTE(review): the quarter names are hard-coded in both the field names
	// and the JSON keys, so this schema goes stale as new quarters are
	// reported; ComprehensiveKeyStatisticsDTO uses a []HistoricalQuarter
	// slice instead. Changing this would break the public API and JSON shape.
	Historical struct {
		Q2_2025 struct {
			Date                                 string  `json:"date"`
			TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
			CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
			GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
			OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
			OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
			NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
			OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
			PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
			TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
			NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
			BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
			DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
			BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
			DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
			TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
			NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
			EBIT                                 *Scaled `json:"ebit,omitempty"`
			EBITDA                               *Scaled `json:"ebitda,omitempty"`
			ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
			ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
			NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`
		} `json:"q2_2025"`

		Q1_2025 struct {
			Date                                 string  `json:"date"`
			TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
			CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
			GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
			OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
			OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
			NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
			OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
			PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
			TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
			NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
			BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
			DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
			BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
			DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
			TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
			NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
			EBIT                                 *Scaled `json:"ebit,omitempty"`
			EBITDA                               *Scaled `json:"ebitda,omitempty"`
			ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
			ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
			NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`
		} `json:"q1_2025"`

		Q4_2024 struct {
			Date                                 string  `json:"date"`
			TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
			CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
			GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
			OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
			OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
			NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
			OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
			PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
			TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
			NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
			BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
			DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
			BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
			DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
			TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
			NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
			EBIT                                 *Scaled `json:"ebit,omitempty"`
			EBITDA                               *Scaled `json:"ebitda,omitempty"`
			ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
			ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
			NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`
		} `json:"q4_2024"`

		Q3_2024 struct {
			Date                                 string  `json:"date"`
			TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
			CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
			GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
			OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
			OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
			NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
			OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
			PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
			TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
			NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
			BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
			DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
			BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
			DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
			TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
			NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
			EBIT                                 *Scaled `json:"ebit,omitempty"`
			EBITDA                               *Scaled `json:"ebitda,omitempty"`
			ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
			ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
			NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`
		} `json:"q3_2024"`

		Q2_2024 struct {
			Date                                 string  `json:"date"`
			TotalRevenue                         *Scaled `json:"total_revenue,omitempty"`
			CostOfRevenue                        *Scaled `json:"cost_of_revenue,omitempty"`
			GrossProfit                          *Scaled `json:"gross_profit,omitempty"`
			OperatingExpense                     *Scaled `json:"operating_expense,omitempty"`
			OperatingIncome                      *Scaled `json:"operating_income,omitempty"`
			NetNonOperatingInterestIncomeExpense *Scaled `json:"net_non_operating_interest_income_expense,omitempty"`
			OtherIncomeExpense                   *Scaled `json:"other_income_expense,omitempty"`
			PretaxIncome                         *Scaled `json:"pretax_income,omitempty"`
			TaxProvision                         *Scaled `json:"tax_provision,omitempty"`
			NetIncomeCommonStockholders          *Scaled `json:"net_income_common_stockholders,omitempty"`
			BasicEPS                             *Scaled `json:"basic_eps,omitempty"`
			DilutedEPS                           *Scaled `json:"diluted_eps,omitempty"`
			BasicAverageShares                   *int64  `json:"basic_average_shares,omitempty"`
			DilutedAverageShares                 *int64  `json:"diluted_average_shares,omitempty"`
			TotalExpenses                        *Scaled `json:"total_expenses,omitempty"`
			NormalizedIncome                     *Scaled `json:"normalized_income,omitempty"`
			EBIT                                 *Scaled `json:"ebit,omitempty"`
			EBITDA                               *Scaled `json:"ebitda,omitempty"`
			ReconciledCostOfRevenue              *Scaled `json:"reconciled_cost_of_revenue,omitempty"`
			ReconciledDepreciation               *Scaled `json:"reconciled_depreciation,omitempty"`
			NormalizedEBITDA                     *Scaled `json:"normalized_ebitda,omitempty"`
		} `json:"q2_2024"`
	} `json:"historical"`
}

ComprehensiveFinancialsDTO holds all financials data including historical

func ParseComprehensiveFinancials

func ParseComprehensiveFinancials(html []byte, symbol, market string) (*ComprehensiveFinancialsDTO, error)

ParseComprehensiveFinancials extracts comprehensive financials data from HTML using JSON parsing

func ParseComprehensiveFinancialsWithCurrency

func ParseComprehensiveFinancialsWithCurrency(html, financialsHTML []byte, symbol, market string) (*ComprehensiveFinancialsDTO, error)

ParseComprehensiveFinancialsWithCurrency parses financial data from one HTML source and currency from financials HTML

type ComprehensiveKeyStatisticsDTO

// ComprehensiveKeyStatisticsDTO holds all key statistics data for one symbol.
// Value fields are pointers tagged omitempty, so a nil pointer is left out
// of the JSON output; the Current and Additional objects are always emitted.
type ComprehensiveKeyStatisticsDTO struct {
	Symbol   string    `json:"symbol"`
	Market   string    `json:"market"`
	Currency string    `json:"currency"`
	AsOf     time.Time `json:"as_of"`

	// Current values (most recent data)
	// The field set matches ColumnPatterns one-for-one.
	Current struct {
		MarketCap              *Scaled `json:"market_cap,omitempty"`
		EnterpriseValue        *Scaled `json:"enterprise_value,omitempty"`
		TrailingPE             *Scaled `json:"trailing_pe,omitempty"`
		ForwardPE              *Scaled `json:"forward_pe,omitempty"`
		PEGRatio               *Scaled `json:"peg_ratio,omitempty"`
		PriceSales             *Scaled `json:"price_sales,omitempty"`
		PriceBook              *Scaled `json:"price_book,omitempty"`
		EnterpriseValueRevenue *Scaled `json:"enterprise_value_revenue,omitempty"`
		EnterpriseValueEBITDA  *Scaled `json:"enterprise_value_ebitda,omitempty"`
	} `json:"current"`

	// Additional statistics (from other parts of the page)
	// SharesOutstanding is a plain integer count; the rest use Scaled.
	Additional struct {
		Beta              *Scaled `json:"beta,omitempty"`
		SharesOutstanding *int64  `json:"shares_outstanding,omitempty"`
		ProfitMargin      *Scaled `json:"profit_margin,omitempty"`
		OperatingMargin   *Scaled `json:"operating_margin,omitempty"`
		ReturnOnAssets    *Scaled `json:"return_on_assets,omitempty"`
		ReturnOnEquity    *Scaled `json:"return_on_equity,omitempty"`
	} `json:"additional"`

	// Historical values - dynamic quarters
	// Omitted from JSON when the slice is nil or empty.
	Historical []HistoricalQuarter `json:"historical,omitempty"`
}

ComprehensiveKeyStatisticsDTO holds all key statistics data

func ParseComprehensiveKeyStatistics

func ParseComprehensiveKeyStatistics(html []byte, symbol, market string) (*ComprehensiveKeyStatisticsDTO, error)

ParseComprehensiveKeyStatistics extracts comprehensive key statistics data from HTML

type ComprehensiveProfileDTO

type ComprehensiveProfileDTO struct {
	Symbol string    `json:"symbol"`
	Market string    `json:"market"`
	AsOf   time.Time `json:"as_of"`

	// Company Information
	CompanyName       string `json:"company_name,omitempty"`
	ShortName         string `json:"short_name,omitempty"`
	Address1          string `json:"address1,omitempty"`
	City              string `json:"city,omitempty"`
	State             string `json:"state,omitempty"`
	Zip               string `json:"zip,omitempty"`
	Country           string `json:"country,omitempty"`
	Phone             string `json:"phone,omitempty"`
	Website           string `json:"website,omitempty"`
	Industry          string `json:"industry,omitempty"`
	Sector            string `json:"sector,omitempty"`
	FullTimeEmployees *int64 `json:"full_time_employees,omitempty"`
	BusinessSummary   string `json:"business_summary,omitempty"`

	// Key Executives
	Executives []Executive `json:"executives,omitempty"`

	// Additional Information
	MaxAge                    *int64 `json:"max_age,omitempty"`
	AuditRisk                 *int64 `json:"audit_risk,omitempty"`
	BoardRisk                 *int64 `json:"board_risk,omitempty"`
	CompensationRisk          *int64 `json:"compensation_risk,omitempty"`
	ShareHolderRightsRisk     *int64 `json:"share_holder_rights_risk,omitempty"`
	OverallRisk               *int64 `json:"overall_risk,omitempty"`
	GovernanceEpochDate       *int64 `json:"governance_epoch_date,omitempty"`
	CompensationAsOfEpochDate *int64 `json:"compensation_as_of_epoch_date,omitempty"`
}

ComprehensiveProfileDTO holds comprehensive profile data

func ParseComprehensiveProfile

func ParseComprehensiveProfile(html []byte, symbol, market string) (*ComprehensiveProfileDTO, error)

ParseComprehensiveProfile extracts comprehensive profile data from HTML using JSON parsing

type Config

// Config is the top-level scraping configuration. Every field carries a yaml
// tag, so the struct maps onto a YAML document with the keys shown.
//
// Retry and Endpoints are nested sections (RetryConfig and EndpointConfig).
// RobotsPolicy is a plain string here rather than the RobotsPolicy type.
// NOTE(review): presumably it must be one of "enforce", "warn", or "ignore"
// (see the RobotsPolicy constants and IsValidRobotsPolicy) — confirm.
// NOTE(review): TimeoutMs and CacheTTLMs are presumably milliseconds, going by
// their names — confirm against the code that consumes them.
type Config struct {
	Enabled      bool           `yaml:"enabled"`
	UserAgent    string         `yaml:"user_agent"`
	TimeoutMs    int            `yaml:"timeout_ms"`
	QPS          float64        `yaml:"qps"`
	Burst        int            `yaml:"burst"`
	Retry        RetryConfig    `yaml:"retry"`
	RobotsPolicy string         `yaml:"robots_policy"`
	CacheTTLMs   int            `yaml:"cache_ttl_ms"`
	Endpoints    EndpointConfig `yaml:"endpoints"`
}

Config represents the scraping configuration

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a sensible default configuration

type Currency

type Currency = string

Currency represents an ISO-4217 currency code

func CoerceCurrency

func CoerceCurrency(v any) (Currency, bool)

CoerceCurrency extracts currency from various Yahoo formats

type EndpointConfig

type EndpointConfig struct {
	KeyStatistics bool `yaml:"key_statistics"`
	Financials    bool `yaml:"financials"`
	Analysis      bool `yaml:"analysis"`
	Profile       bool `yaml:"profile"`
	News          bool `yaml:"news"`
}

EndpointConfig represents endpoint-specific configuration

type Executive

type Executive struct {
	Name             string `json:"name,omitempty"`
	Title            string `json:"title,omitempty"`
	YearBorn         *int   `json:"year_born,omitempty"`
	TotalPay         *int64 `json:"total_pay,omitempty"`
	ExercisedValue   *int64 `json:"exercised_value,omitempty"`
	UnexercisedValue *int64 `json:"unexercised_value,omitempty"`
}

Executive represents a company executive

type FetchMeta

type FetchMeta struct {
	URL          string        `json:"url"`
	Host         string        `json:"host"`
	Status       int           `json:"status"`
	Attempt      int           `json:"attempt"`
	Bytes        int           `json:"bytes"`
	Gzip         bool          `json:"gzip"`
	Redirects    int           `json:"redirects"`
	Duration     time.Duration `json:"duration"`
	FromCache    bool          `json:"from_cache"` // reserved for optional HTML in-run cache
	RobotsPolicy string        `json:"robots_policy"`
}

FetchMeta contains metadata about a fetch operation

type FinancialDataPoint

type FinancialDataPoint struct {
	DataID        int64  `json:"dataId"`
	AsOfDate      string `json:"asOfDate"`
	PeriodType    string `json:"periodType"`
	CurrencyCode  string `json:"currencyCode"`
	ReportedValue struct {
		Raw float64 `json:"raw"`
		Fmt string  `json:"fmt"`
	} `json:"reportedValue"`
}

FinancialDataPoint represents a single financial data point from Yahoo Finance

type FinancialsDTO

type FinancialsDTO struct {
	Symbol string       `json:"symbol"`
	Market string       `json:"market"`
	Lines  []PeriodLine `json:"lines"`
	AsOf   time.Time    `json:"as_of"`
}

FinancialsDTO represents extracted financial statements data

type FinancialsRegexConfig

type FinancialsRegexConfig struct {
	Currency struct {
		Pattern string `yaml:"pattern"`
	} `yaml:"currency"`

	IncomeStatement struct {
		TotalRevenue     string `yaml:"total_revenue"`
		CostOfRevenue    string `yaml:"cost_of_revenue"`
		OperatingIncome  string `yaml:"operating_income"`
		NetIncome        string `yaml:"net_income"`
		BasicEPS         string `yaml:"basic_eps"`
		DilutedEPS       string `yaml:"diluted_eps"`
		EBITDA           string `yaml:"ebitda"`
		EBIT             string `yaml:"ebit"`
		TotalExpenses    string `yaml:"total_expenses"`
		NormalizedEBITDA string `yaml:"normalized_ebitda"`
	} `yaml:"income_statement"`

	Shares struct {
		BasicAverageShares   string `yaml:"basic_average_shares"`
		DilutedAverageShares string `yaml:"diluted_average_shares"`
	} `yaml:"shares"`

	BalanceSheet struct {
		TotalAssets             string `yaml:"total_assets"`
		TotalCapitalization     string `yaml:"total_capitalization"`
		CommonStockEquity       string `yaml:"common_stock_equity"`
		CapitalLeaseObligations string `yaml:"capital_lease_obligations"`
		NetTangibleAssets       string `yaml:"net_tangible_assets"`
		WorkingCapital          string `yaml:"working_capital"`
		InvestedCapital         string `yaml:"invested_capital"`
		TangibleBookValue       string `yaml:"tangible_book_value"`
		TotalDebt               string `yaml:"total_debt"`
		ShareIssued             string `yaml:"share_issued"`
	} `yaml:"balance_sheet"`

	CashFlow struct {
		OperatingCashFlow        string `yaml:"operating_cash_flow"`
		InvestingCashFlow        string `yaml:"investing_cash_flow"`
		FinancingCashFlow        string `yaml:"financing_cash_flow"`
		EndCashPosition          string `yaml:"end_cash_position"`
		CapitalExpenditure       string `yaml:"capital_expenditure"`
		IssuanceOfDebt           string `yaml:"issuance_of_debt"`
		RepaymentOfDebt          string `yaml:"repayment_of_debt"`
		RepurchaseOfCapitalStock string `yaml:"repurchase_of_capital_stock"`
		FreeCashFlow             string `yaml:"free_cash_flow"`
	} `yaml:"cash_flow"`
}

FinancialsRegexConfig holds the regex patterns for financials extraction

type HistoricalQuarter

// HistoricalQuarter is one dated entry of ComprehensiveKeyStatisticsDTO.Historical.
// Its metric fields are the same set as ComprehensiveKeyStatisticsDTO.Current
// (market cap, enterprise value, P/E ratios, and so on). Each metric is a
// *Scaled tagged omitempty, so a nil metric is left out of the JSON output.
// Date is always emitted; its string format is not specified by this type.
type HistoricalQuarter struct {
	Date                   string  `json:"date"`
	MarketCap              *Scaled `json:"market_cap,omitempty"`
	EnterpriseValue        *Scaled `json:"enterprise_value,omitempty"`
	TrailingPE             *Scaled `json:"trailing_pe,omitempty"`
	ForwardPE              *Scaled `json:"forward_pe,omitempty"`
	PEGRatio               *Scaled `json:"peg_ratio,omitempty"`
	PriceSales             *Scaled `json:"price_sales,omitempty"`
	PriceBook              *Scaled `json:"price_book,omitempty"`
	EnterpriseValueRevenue *Scaled `json:"enterprise_value_revenue,omitempty"`
	EnterpriseValueEBITDA  *Scaled `json:"enterprise_value_ebitda,omitempty"`
}

type InflightTracker

type InflightTracker struct {
	// contains filtered or unexported fields
}

InflightTracker tracks in-flight requests per host

func NewInflightTracker

func NewInflightTracker() *InflightTracker

NewInflightTracker creates a new in-flight tracker

func (*InflightTracker) Decrement

func (it *InflightTracker) Decrement(host string)

Decrement decrements the in-flight count for a host

func (*InflightTracker) GetAllCounts

func (it *InflightTracker) GetAllCounts() map[string]int

GetAllCounts returns all in-flight counts

func (*InflightTracker) GetCount

func (it *InflightTracker) GetCount(host string) int

GetCount returns the current in-flight count for a host

func (*InflightTracker) Increment

func (it *InflightTracker) Increment(host string)

Increment increments the in-flight count for a host

type KeyStatisticsDTO

type KeyStatisticsDTO struct {
	Symbol   string   `json:"symbol"`
	Market   string   `json:"market"`
	Currency Currency `json:"currency"`

	// Market metrics (from summaryDetail - real-time data)
	MarketCap    *Scaled `json:"market_cap,omitempty"`
	ForwardPE    *Scaled `json:"forward_pe,omitempty"`
	TrailingPE   *Scaled `json:"trailing_pe,omitempty"`
	Beta         *Scaled `json:"beta,omitempty"`
	PriceToSales *Scaled `json:"price_to_sales,omitempty"`

	// Share data
	SharesOutstanding *int64 `json:"shares_outstanding,omitempty"`
	FloatShares       *int64 `json:"float_shares,omitempty"`
	ShortInterest     *int64 `json:"short_interest,omitempty"`

	// Financial metrics (from financialData)
	EnterpriseValue  *Scaled `json:"enterprise_value,omitempty"`
	TotalCash        *Scaled `json:"total_cash,omitempty"`
	TotalDebt        *Scaled `json:"total_debt,omitempty"`
	QuickRatio       *Scaled `json:"quick_ratio,omitempty"`
	CurrentRatio     *Scaled `json:"current_ratio,omitempty"`
	DebtToEquity     *Scaled `json:"debt_to_equity,omitempty"`
	ReturnOnAssets   *Scaled `json:"return_on_assets,omitempty"`
	ReturnOnEquity   *Scaled `json:"return_on_equity,omitempty"`
	GrossMargins     *Scaled `json:"gross_margins,omitempty"`
	OperatingMargins *Scaled `json:"operating_margins,omitempty"`
	ProfitMargins    *Scaled `json:"profit_margins,omitempty"`
	RevenueGrowth    *Scaled `json:"revenue_growth,omitempty"`
	EarningsGrowth   *Scaled `json:"earnings_growth,omitempty"`

	// Price data
	FiftyTwoWeekHigh   *Scaled `json:"fifty_two_week_high,omitempty"`
	FiftyTwoWeekLow    *Scaled `json:"fifty_two_week_low,omitempty"`
	AverageVolume      *int64  `json:"average_volume,omitempty"`
	AverageVolume10Day *int64  `json:"average_volume_10_day,omitempty"`

	AsOf time.Time `json:"as_of"`
}

KeyStatisticsDTO represents extracted key statistics data

type LogEntry

type LogEntry struct {
	Timestamp string                 `json:"timestamp"`
	Level     string                 `json:"level"`
	Source    string                 `json:"source"`
	Message   string                 `json:"message"`
	Fields    map[string]interface{} `json:"fields,omitempty"`
}

LogEntry represents a structured log entry

type Logger

type Logger struct {
	// contains filtered or unexported fields
}

Logger handles structured logging for scraping operations

func NewLogger

func NewLogger() *Logger

NewLogger creates a new logger instance

func (*Logger) GetStats

func (l *Logger) GetStats() map[string]interface{}

GetStats returns logger statistics

func (*Logger) LogBackoff

func (l *Logger) LogBackoff(url, host string, delay time.Duration)

LogBackoff logs a backoff event

func (*Logger) LogConfig

func (l *Logger) LogConfig(config *Config)

LogConfig logs configuration information

func (*Logger) LogDebug

func (l *Logger) LogDebug(message string, fields map[string]interface{})

LogDebug logs a debug message

func (*Logger) LogError

func (l *Logger) LogError(message string, err error, fields map[string]interface{})

LogError logs a general error

func (*Logger) LogInfo

func (l *Logger) LogInfo(message string, fields map[string]interface{})

LogInfo logs an info message

func (*Logger) LogRateLimit

func (l *Logger) LogRateLimit(url, host, errorMsg string)

LogRateLimit logs a rate limit event

func (*Logger) LogRequest

func (l *Logger) LogRequest(url, host string, status, attempt int, duration time.Duration, bytes int, gzip bool, redirects int, errorMsg string)

LogRequest logs a scraping request

func (*Logger) LogRetry

func (l *Logger) LogRetry(url, host string, attempt int, reason, errorMsg string)

LogRetry logs a retry event

func (*Logger) LogRobotsDenied

func (l *Logger) LogRobotsDenied(url, host, errorMsg string)

LogRobotsDenied logs a robots.txt denial

func (*Logger) LogRobotsFetch

func (l *Logger) LogRobotsFetch(host string, success bool, errorMsg string)

LogRobotsFetch logs a robots.txt fetch event

func (*Logger) SetOutput

func (l *Logger) SetOutput(output interface{ Write([]byte) (int, error) })

SetOutput sets the output destination for the logger

type Metrics

type Metrics struct {
	// contains filtered or unexported fields
}

Metrics handles Prometheus metrics for scraping operations

func NewMetrics

func NewMetrics() *Metrics

NewMetrics creates a new metrics instance

func (*Metrics) GetStats

func (m *Metrics) GetStats() map[string]interface{}

GetStats returns current metrics statistics

func (*Metrics) RecordBackoff

func (m *Metrics) RecordBackoff(host, reason string)

RecordBackoff records a backoff event

func (*Metrics) RecordBackoffSleep

func (m *Metrics) RecordBackoffSleep(host, reason string, duration time.Duration)

RecordBackoffSleep records backoff sleep duration

func (*Metrics) RecordInflight

func (m *Metrics) RecordInflight(host string, count int)

RecordInflight records in-flight requests

func (*Metrics) RecordLatency

func (m *Metrics) RecordLatency(host string, duration time.Duration)

RecordLatency records request latency

func (*Metrics) RecordNews

func (m *Metrics) RecordNews(outcome string)

RecordNews records a news parsing operation

func (*Metrics) RecordNewsParseLatency

func (m *Metrics) RecordNewsParseLatency(duration time.Duration)

RecordNewsParseLatency records news parsing latency

func (*Metrics) RecordPageBytes

func (m *Metrics) RecordPageBytes(host string, bytes int)

RecordPageBytes records page size

func (*Metrics) RecordRequest

func (m *Metrics) RecordRequest(host, outcome, code string)

RecordRequest records a scraping request

func (*Metrics) RecordRetry

func (m *Metrics) RecordRetry(host, reason string)

RecordRetry records a retry event

func (*Metrics) RecordRobotsDenied

func (m *Metrics) RecordRobotsDenied(host string)

RecordRobotsDenied records a robots.txt denial

type NewsItem

type NewsItem struct {
	Title          string     `json:"title"`
	URL            string     `json:"url"` // absolute; normalized
	Source         string     `json:"source"`
	PublishedAt    *time.Time `json:"published_at"` // UTC if resolvable
	ImageURL       string     `json:"image_url"`
	RelatedTickers []string   `json:"related_tickers"`
}

NewsItem represents a single news article extracted from Yahoo Finance

type NewsRegexConfig

type NewsRegexConfig struct {
	ArticleContainer string `yaml:"article_container"`
	Title            string `yaml:"title"`
	ArticleLink      string `yaml:"article_link"`
	PublishingInfo   string `yaml:"publishing_info"`
	ImageURL         string `yaml:"image_url"`
	RelatedTickers   string `yaml:"related_tickers"`
	NextPageHint     string `yaml:"next_page_hint"`

	RelativeTime struct {
		Minutes   string `yaml:"minutes"`
		Hours     string `yaml:"hours"`
		Days      string `yaml:"days"`
		Weeks     string `yaml:"weeks"`
		Yesterday string `yaml:"yesterday"`
	} `yaml:"relative_time"`

	URLCleanup struct {
		UTMParams      string `yaml:"utm_params"`
		TrackingParams string `yaml:"tracking_params"`
		Fragment       string `yaml:"fragment"`
		QuerySeparator string `yaml:"query_separator"`
	} `yaml:"url_cleanup"`
}

NewsRegexConfig holds the regex patterns for news extraction

type NewsStats

type NewsStats struct {
	TotalFound    int       `json:"total_found"`
	TotalReturned int       `json:"total_returned"`
	Deduped       int       `json:"deduped"`
	NextPageHint  string    `json:"next_page_hint"` // e.g., a data-cursor or bool flag if detected
	AsOf          time.Time `json:"as_of"`
}

NewsStats represents statistics about news extraction

type Officer

type Officer struct {
	Name  string  `json:"name"`
	Title string  `json:"title"`
	Age   *int    `json:"age,omitempty"`
	Pay   *Scaled `json:"pay,omitempty"`
}

Officer represents a company officer/executive

type PeriodLine

type PeriodLine struct {
	PeriodStart time.Time `json:"period_start"`
	PeriodEnd   time.Time `json:"period_end"`
	Key         string    `json:"key"`
	Value       Scaled    `json:"value"`
	Currency    Currency  `json:"currency"`
}

PeriodLine represents a financial statement line item for a specific period

type ProfileDTO

type ProfileDTO struct {
	Symbol    string    `json:"symbol"`
	Market    string    `json:"market"`
	Company   string    `json:"company"`
	Address1  string    `json:"address1"`
	City      string    `json:"city"`
	State     string    `json:"state"`
	Country   string    `json:"country"`
	Phone     string    `json:"phone"`
	Website   string    `json:"website"`
	Industry  string    `json:"industry"`
	Sector    string    `json:"sector"`
	Employees *int      `json:"employees,omitempty"`
	Officers  []Officer `json:"officers"`
	AsOf      time.Time `json:"as_of"`
}

ProfileDTO represents extracted company profile data

type QuarterlyEPS

type QuarterlyEPS struct {
	Date     string  `json:"date"`
	Actual   *Scaled `json:"actual,omitempty"`
	Estimate *Scaled `json:"estimate,omitempty"`
}

QuarterlyEPS represents quarterly EPS estimates and actuals

type RateLimitConfig

type RateLimitConfig struct {
	QPS            float64
	Burst          int
	PerHostWorkers int
}

RateLimitConfig represents rate limiting configuration

func DefaultRateLimitConfig

func DefaultRateLimitConfig() *RateLimitConfig

DefaultRateLimitConfig returns a sensible default rate limit configuration

type RateLimiter

type RateLimiter struct {
	// contains filtered or unexported fields
}

RateLimiter implements per-host rate limiting

func NewRateLimiter

func NewRateLimiter(qps float64, burst int) *RateLimiter

NewRateLimiter creates a new rate limiter

func (*RateLimiter) Wait

func (rl *RateLimiter) Wait(ctx context.Context) error

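Wait blocks until a token is available, or returns an error (presumably when ctx is cancelled or its deadline passes). Note that it takes only a context, not a host.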

type Recommendation

type Recommendation struct {
	Period     string `json:"period"`
	StrongBuy  int    `json:"strong_buy"`
	Buy        int    `json:"buy"`
	Hold       int    `json:"hold"`
	Sell       int    `json:"sell"`
	StrongSell int    `json:"strong_sell"`
}

Recommendation represents analyst recommendation data for a period

type RegexConfig

type RegexConfig struct {
	Current struct {
		MarketCap              string `yaml:"market_cap"`
		EnterpriseValue        string `yaml:"enterprise_value"`
		TrailingPE             string `yaml:"trailing_pe"`
		ForwardPE              string `yaml:"forward_pe"`
		PEGRatio               string `yaml:"peg_ratio"`
		PriceSales             string `yaml:"price_sales"`
		PriceBook              string `yaml:"price_book"`
		EnterpriseValueRevenue string `yaml:"enterprise_value_revenue"`
		EnterpriseValueEBITDA  string `yaml:"enterprise_value_ebitda"`
	} `yaml:"current"`

	Additional struct {
		Beta              string `yaml:"beta"`
		SharesOutstanding string `yaml:"shares_outstanding"`
		ProfitMargin      string `yaml:"profit_margin"`
		OperatingMargin   string `yaml:"operating_margin"`
		ReturnOnAssets    string `yaml:"return_on_assets"`
		ReturnOnEquity    string `yaml:"return_on_equity"`
	} `yaml:"additional"`

	HistoricalColumns struct {
		Column2 ColumnPatterns `yaml:"column_2"`
		Column3 ColumnPatterns `yaml:"column_3"`
		Column4 ColumnPatterns `yaml:"column_4"`
		Column5 ColumnPatterns `yaml:"column_5"`
		Column6 ColumnPatterns `yaml:"column_6"`
	} `yaml:"historical_columns"`

	DateHeaders string `yaml:"date_headers"`
}

RegexConfig holds the regex patterns for statistics extraction

type RetryConfig

type RetryConfig struct {
	Attempts   int `yaml:"attempts"`
	BaseMs     int `yaml:"base_ms"`
	MaxDelayMs int `yaml:"max_delay_ms"`
}

RetryConfig represents retry configuration

type RobotsCache

// RobotsCache is a cached robots.txt result for a single host. Host names the
// host, Rules holds the rule groups (one RobotsRule per user agent entry),
// FetchedAt records when the entry was obtained, and TTL is its lifetime.
// NOTE(review): IsExpired presumably reports whether TTL has elapsed since
// FetchedAt — confirm against its implementation.
type RobotsCache struct {
	Host      string
	Rules     []RobotsRule
	FetchedAt time.Time
	TTL       time.Duration
}

RobotsCache represents cached robots.txt data

func (*RobotsCache) IsExpired

func (rc *RobotsCache) IsExpired() bool

IsExpired checks if the robots cache is expired

type RobotsManager

type RobotsManager struct {
	// contains filtered or unexported fields
}

RobotsManager handles robots.txt fetching, caching, and policy enforcement

func NewRobotsManager

func NewRobotsManager(policy string, ttl time.Duration) *RobotsManager

NewRobotsManager creates a new robots manager

func (*RobotsManager) CheckRobots

func (rm *RobotsManager) CheckRobots(ctx context.Context, host, path string) error

CheckRobots checks if a path is allowed by robots.txt

func (*RobotsManager) ClearCache

func (rm *RobotsManager) ClearCache()

ClearCache clears the robots.txt cache

func (*RobotsManager) GetCacheStats

func (rm *RobotsManager) GetCacheStats() map[string]interface{}

GetCacheStats returns cache statistics

type RobotsPolicy

type RobotsPolicy string

RobotsPolicy represents the robots.txt policy

// Defined RobotsPolicy values.
// NOTE(review): presumably "enforce" rejects disallowed paths (see
// ErrRobotsDenied), "warn" only reports them, and "ignore" skips the
// robots.txt check entirely — confirm against RobotsManager.CheckRobots.
const (
	RobotsEnforce RobotsPolicy = "enforce"
	RobotsWarn    RobotsPolicy = "warn"
	RobotsIgnore  RobotsPolicy = "ignore"
)

type RobotsRule

type RobotsRule struct {
	UserAgent string
	Allow     []string
	Disallow  []string
}

RobotsRule represents a robots.txt rule

type Scaled

// Scaled is a fixed-point number stored as an integer (Scaled) together with
// a decimal scale (Scale). Both fields are always present in the JSON output.
// NOTE(review): presumably the represented value is Scaled / 10^Scale, e.g.
// {Scaled: 1234, Scale: 2} meaning 12.34 — confirm against Float64 and String.
type Scaled struct {
	Scaled int64 `json:"scaled"`
	Scale  int   `json:"scale"` // e.g., 2 for cents, 6 for micro-units
}

Scaled represents a scaled decimal number with precision preservation

func IntToScaled

func IntToScaled(i YahooInt, scale int) (Scaled, bool)

IntToScaled converts a YahooInt to a Scaled value with the given scale

func NumToScaled

func NumToScaled(n YahooNum, scale int) (Scaled, bool)

NumToScaled converts a YahooNum to a Scaled value with the given scale

func (Scaled) Float64

func (s Scaled) Float64() float64

Float64 returns the float64 value of Scaled

func (Scaled) String

func (s Scaled) String() string

String returns a human-readable representation of Scaled

type ScrapeError

// ScrapeError is the error type used throughout the package; the predefined
// Err* variables are *ScrapeError values distinguished by Type.
//
// Type is a short machine-readable category such as "timeout" or
// "robots_denied", and Message is the human-readable description. URL and
// Status are left at their zero values in the predefined errors.
// NOTE(review): URL and Status are presumably filled in by ErrHTTP, which
// takes a status and a url — confirm.
type ScrapeError struct {
	Type    string
	Message string
	URL     string
	Status  int
}

ScrapeError represents a scraping-specific error

func ErrHTTP

func ErrHTTP(status int, url string) *ScrapeError

ErrHTTP creates an HTTP status error

func ErrMissingField

func ErrMissingField(field string) *ScrapeError

ErrMissingField creates a missing field error

func ErrSchemaDrift

func ErrSchemaDrift(field string) *ScrapeError

ErrSchemaDrift creates a schema drift error

func (*ScrapeError) Error

func (e *ScrapeError) Error() string

type Tracer

type Tracer struct {
	// contains filtered or unexported fields
}

Tracer handles OpenTelemetry tracing for scraping operations

func NewTracer

func NewTracer() *Tracer

NewTracer creates a new tracer instance

func (*Tracer) EndSpan

func (t *Tracer) EndSpan(span interface{})

EndSpan ends the span

func (*Tracer) GetStats

func (t *Tracer) GetStats() map[string]interface{}

GetStats returns tracer statistics

func (*Tracer) RecordSpanError

func (t *Tracer) RecordSpanError(span interface{}, err error)

RecordSpanError records an error in the span

func (*Tracer) StartFetchSpan

func (t *Tracer) StartFetchSpan(ctx context.Context, url, host string) (context.Context, interface{})

StartFetchSpan starts a new trace span for a fetch operation

func (*Tracer) UpdateSpan

func (t *Tracer) UpdateSpan(span interface{}, status, bytes int, duration time.Duration)

UpdateSpan updates span with response information

type YahooFinanceData

type YahooFinanceData struct {
	QuoteSummary struct {
		Result []struct {
			FinancialData struct {
				TrailingTotalRevenue                         []FinancialDataPoint `json:"trailingTotalRevenue"`
				AnnualTotalRevenue                           []FinancialDataPoint `json:"annualTotalRevenue"`
				TrailingOperatingIncome                      []FinancialDataPoint `json:"trailingOperatingIncome"`
				AnnualOperatingIncome                        []FinancialDataPoint `json:"annualOperatingIncome"`
				TrailingNetIncome                            []FinancialDataPoint `json:"trailingNetIncome"`
				AnnualNetIncome                              []FinancialDataPoint `json:"annualNetIncome"`
				TrailingBasicEPS                             []FinancialDataPoint `json:"trailingBasicEPS"`
				AnnualBasicEPS                               []FinancialDataPoint `json:"annualBasicEPS"`
				TrailingDilutedEPS                           []FinancialDataPoint `json:"trailingDilutedEPS"`
				AnnualDilutedEPS                             []FinancialDataPoint `json:"annualDilutedEPS"`
				TrailingEBITDA                               []FinancialDataPoint `json:"trailingEBITDA"`
				AnnualEBITDA                                 []FinancialDataPoint `json:"annualEBITDA"`
				TrailingGrossProfit                          []FinancialDataPoint `json:"trailingGrossProfit"`
				AnnualGrossProfit                            []FinancialDataPoint `json:"annualGrossProfit"`
				TrailingCostOfRevenue                        []FinancialDataPoint `json:"trailingCostOfRevenue"`
				AnnualCostOfRevenue                          []FinancialDataPoint `json:"annualCostOfRevenue"`
				TrailingOperatingExpense                     []FinancialDataPoint `json:"trailingOperatingExpense"`
				AnnualOperatingExpense                       []FinancialDataPoint `json:"annualOperatingExpense"`
				TrailingTotalExpenses                        []FinancialDataPoint `json:"trailingTotalExpenses"`
				AnnualTotalExpenses                          []FinancialDataPoint `json:"annualTotalExpenses"`
				TrailingTaxProvision                         []FinancialDataPoint `json:"trailingTaxProvision"`
				AnnualTaxProvision                           []FinancialDataPoint `json:"annualTaxProvision"`
				TrailingPretaxIncome                         []FinancialDataPoint `json:"trailingPretaxIncome"`
				AnnualPretaxIncome                           []FinancialDataPoint `json:"annualPretaxIncome"`
				TrailingOtherIncomeExpense                   []FinancialDataPoint `json:"trailingOtherIncomeExpense"`
				AnnualOtherIncomeExpense                     []FinancialDataPoint `json:"annualOtherIncomeExpense"`
				TrailingNetNonOperatingInterestIncomeExpense []FinancialDataPoint `json:"trailingNetNonOperatingInterestIncomeExpense"`
				AnnualNetNonOperatingInterestIncomeExpense   []FinancialDataPoint `json:"annualNetNonOperatingInterestIncomeExpense"`
				TrailingBasicAverageShares                   []FinancialDataPoint `json:"trailingBasicAverageShares"`
				AnnualBasicAverageShares                     []FinancialDataPoint `json:"annualBasicAverageShares"`
				TrailingDilutedAverageShares                 []FinancialDataPoint `json:"trailingDilutedAverageShares"`
				AnnualDilutedAverageShares                   []FinancialDataPoint `json:"annualDilutedAverageShares"`
				TrailingEBIT                                 []FinancialDataPoint `json:"trailingEBIT"`
				AnnualEBIT                                   []FinancialDataPoint `json:"annualEBIT"`
				TrailingNormalizedIncome                     []FinancialDataPoint `json:"trailingNormalizedIncome"`
				AnnualNormalizedIncome                       []FinancialDataPoint `json:"annualNormalizedIncome"`
				TrailingNormalizedEBITDA                     []FinancialDataPoint `json:"trailingNormalizedEBITDA"`
				AnnualNormalizedEBITDA                       []FinancialDataPoint `json:"annualNormalizedEBITDA"`
				TrailingReconciledCostOfRevenue              []FinancialDataPoint `json:"trailingReconciledCostOfRevenue"`
				AnnualReconciledCostOfRevenue                []FinancialDataPoint `json:"annualReconciledCostOfRevenue"`
				TrailingReconciledDepreciation               []FinancialDataPoint `json:"trailingReconciledDepreciation"`
				AnnualReconciledDepreciation                 []FinancialDataPoint `json:"annualReconciledDepreciation"`
			} `json:"financialData"`
		} `json:"result"`
	} `json:"quoteSummary"`
}

YahooFinanceData represents the JSON structure from Yahoo Finance

type YahooInt

type YahooInt struct {
	Raw     *int64 `json:"raw,omitempty"`
	Fmt     string `json:"fmt,omitempty"`
	LongFmt string `json:"longFmt,omitempty"`
}

YahooInt represents Yahoo's integer format with raw, fmt, and longFmt

func ToYahooInt

func ToYahooInt(raw *int64, fmt, longFmt string) YahooInt

ToYahooInt builds a YahooInt from separate raw, fmt, and longFmt values

type YahooNum

type YahooNum struct {
	Raw     *float64 `json:"raw,omitempty"`
	Fmt     string   `json:"fmt,omitempty"`
	LongFmt string   `json:"longFmt,omitempty"`
}

YahooNum represents Yahoo's numeric format with raw, fmt, and longFmt

func ToYahooNum

func ToYahooNum(raw *float64, fmt, longFmt string) YahooNum

ToYahooNum builds a YahooNum from separate raw, fmt, and longFmt values

type YahooString

type YahooString struct {
	Raw     *string `json:"raw,omitempty"`
	Fmt     string  `json:"fmt,omitempty"`
	LongFmt string  `json:"longFmt,omitempty"`
}

YahooString represents Yahoo's string format with raw, fmt, and longFmt, whose values might contain numbers

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL