textextractor

package
v0.1.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 26, 2026 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Overview

Package textextractor provides functionality to extract plain text from HWPX documents.

Index

Constants

View Source
const LevelCount = 10
View Source
const OutlineStyleEngNamePrefix = "Outline "

Variables

This section is empty.

Functions

func CreateExtractor

func CreateExtractor(objectType baseobject.ObjectType, manager comm.ExtractorManager, parameter comm.ParameterInterface) comm.Extractor

func Extract

func Extract(hwpxFile *object.HWPXFile, method TextExtractMethod, insertParaHead bool, textMarks *TextMarks) string

Extract extracts all text from the given HWPXFile using the specified method and marks. If insertParaHead is true, it also includes paragraph numbers/bullets in the output.

func ExtractFrom

func ExtractFrom(from baseobject.HWPXObject, method TextExtractMethod, textMarks *TextMarks) string

ExtractFrom extracts text starting from a specific HWPXObject within the document tree.

func ParaHeadNumber_ToString1

func ParaHeadNumber_ToString1(value int, format enumtype.NumberType1) string

func ParaHeadNumber_ToString2

func ParaHeadNumber_ToString2(value int, format enumtype.NumberType2) string

Types

type FromCaption

type FromCaption struct {
	*comm.ExtractorBase
}

func NewFromCaption

func NewFromCaption(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromCaption

func (*FromCaption) Extract

func (f *FromCaption) Extract(from baseobject.HWPXObject)

func (*FromCaption) ObjectType

func (f *FromCaption) ObjectType() baseobject.ObjectType

type FromContainer

type FromContainer struct {
	*comm.ExtractorBase
}

func NewFromContainer

func NewFromContainer(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromContainer

func (*FromContainer) Extract

func (f *FromContainer) Extract(from baseobject.HWPXObject)

func (*FromContainer) ObjectType

func (f *FromContainer) ObjectType() baseobject.ObjectType

type FromDrawText

type FromDrawText struct {
	*comm.ExtractorBase
}

func NewFromDrawText

func NewFromDrawText(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromDrawText

func (*FromDrawText) Extract

func (f *FromDrawText) Extract(from baseobject.HWPXObject)

func (*FromDrawText) ObjectType

func (f *FromDrawText) ObjectType() baseobject.ObjectType

type FromDrawingObject

type FromDrawingObject struct {
	*comm.ExtractorBase
	// contains filtered or unexported fields
}

func NewFromDrawingObject

func NewFromDrawingObject(manager comm.ExtractorManager, parameter comm.ParameterInterface, objectType baseobject.ObjectType) *FromDrawingObject

func (*FromDrawingObject) Extract

func (f *FromDrawingObject) Extract(from baseobject.HWPXObject)

func (*FromDrawingObject) ObjectType

func (f *FromDrawingObject) ObjectType() baseobject.ObjectType

type FromFieldBegin

type FromFieldBegin struct {
	*comm.ExtractorBase
}

func NewFromFieldBegin

func NewFromFieldBegin(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromFieldBegin

func (*FromFieldBegin) Extract

func (f *FromFieldBegin) Extract(from baseobject.HWPXObject)

func (*FromFieldBegin) ObjectType

func (f *FromFieldBegin) ObjectType() baseobject.ObjectType

type FromFieldEnd

type FromFieldEnd struct {
	*comm.ExtractorBase
}

func NewFromFieldEnd

func NewFromFieldEnd(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromFieldEnd

func (*FromFieldEnd) Extract

func (f *FromFieldEnd) Extract(from baseobject.HWPXObject)

func (*FromFieldEnd) ObjectType

func (f *FromFieldEnd) ObjectType() baseobject.ObjectType

type FromHWPXFile

type FromHWPXFile struct {
	*comm.ExtractorBase
}

func NewFromHWPXFile

func NewFromHWPXFile(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromHWPXFile

func (*FromHWPXFile) Extract

func (f *FromHWPXFile) Extract(from baseobject.HWPXObject)

func (*FromHWPXFile) ObjectType

func (f *FromHWPXFile) ObjectType() baseobject.ObjectType

type FromParaControl

type FromParaControl struct {
	*comm.ExtractorBase
}

func NewFromParaControl

func NewFromParaControl(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromParaControl

func (*FromParaControl) Extract

func (f *FromParaControl) Extract(from baseobject.HWPXObject)

func (*FromParaControl) ObjectType

func (f *FromParaControl) ObjectType() baseobject.ObjectType

type FromParaWithAppendingControlTextAfterParagraphText

type FromParaWithAppendingControlTextAfterParagraphText struct {
	*comm.ExtractorBase
}

func (*FromParaWithAppendingControlTextAfterParagraphText) Extract

func (*FromParaWithAppendingControlTextAfterParagraphText) ObjectType

type FromParaWithInsertingControlTextBetweenParagraphText

type FromParaWithInsertingControlTextBetweenParagraphText struct {
	*comm.ExtractorBase
}

func (*FromParaWithInsertingControlTextBetweenParagraphText) Extract

func (*FromParaWithInsertingControlTextBetweenParagraphText) ObjectType

type FromRun

type FromRun struct {
	*comm.ExtractorBase
}

func NewFromRun

func NewFromRun(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromRun

func (*FromRun) Extract

func (f *FromRun) Extract(from baseobject.HWPXObject)

func (*FromRun) ObjectType

func (f *FromRun) ObjectType() baseobject.ObjectType

type FromSectionXMLFile

type FromSectionXMLFile struct {
	*comm.ExtractorBase
}

func NewFromSectionXMLFile

func NewFromSectionXMLFile(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromSectionXMLFile

func (*FromSectionXMLFile) Extract

func (f *FromSectionXMLFile) Extract(from baseobject.HWPXObject)

func (*FromSectionXMLFile) ObjectType

func (f *FromSectionXMLFile) ObjectType() baseobject.ObjectType

type FromSubList

type FromSubList struct {
	*comm.ExtractorBase
}

func NewFromSubList

func NewFromSubList(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromSubList

func (*FromSubList) Extract

func (f *FromSubList) Extract(from baseobject.HWPXObject)

func (*FromSubList) ObjectType

func (f *FromSubList) ObjectType() baseobject.ObjectType

type FromT

type FromT struct {
	*comm.ExtractorBase
}

func NewFromT

func NewFromT(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromT

func (*FromT) Extract

func (f *FromT) Extract(from baseobject.HWPXObject)

func (*FromT) ObjectType

func (f *FromT) ObjectType() baseobject.ObjectType

type FromTable

type FromTable struct {
	*comm.ExtractorBase
}

func NewFromTable

func NewFromTable(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromTable

func (*FromTable) Extract

func (f *FromTable) Extract(from baseobject.HWPXObject)

func (*FromTable) ObjectType

func (f *FromTable) ObjectType() baseobject.ObjectType

type FromTc

type FromTc struct {
	*comm.ExtractorBase
}

func NewFromTc

func NewFromTc(manager comm.ExtractorManager, parameter comm.ParameterInterface) *FromTc

func (*FromTc) Extract

func (f *FromTc) Extract(from baseobject.HWPXObject)

func (*FromTc) ObjectType

func (f *FromTc) ObjectType() baseobject.ObjectType

type NoWorkingExtractor

type NoWorkingExtractor struct {
	*comm.ExtractorBase
}

func NewNoWorkingExtractor

func NewNoWorkingExtractor(manager comm.ExtractorManager, parameter comm.ParameterInterface) *NoWorkingExtractor

func (*NoWorkingExtractor) Extract

func (e *NoWorkingExtractor) Extract(from baseobject.HWPXObject)

func (*NoWorkingExtractor) ObjectType

func (e *NoWorkingExtractor) ObjectType() baseobject.ObjectType

type ParaHeadMaker

type ParaHeadMaker struct {
	// contains filtered or unexported fields
}

func NewParaHeadMaker

func NewParaHeadMaker(hwpxFile *object.HWPXFile) *ParaHeadMaker

func (*ParaHeadMaker) Make

func (m *ParaHeadMaker) Make(para *paragraph.Para, builder *comm.TextBuilder)

func (*ParaHeadMaker) StartSection

func (m *ParaHeadMaker) StartSection()

type ParaNumber

type ParaNumber struct {
	// contains filtered or unexported fields
}

func NewParaNumber

func NewParaNumber() *ParaNumber

func (*ParaNumber) ChangedParaHead

func (p *ParaNumber) ChangedParaHead(headID string) bool

func (*ParaNumber) Increase

func (p *ParaNumber) Increase(level int)

func (*ParaNumber) Reset

func (p *ParaNumber) Reset(headID string, level int, valueForLevel1 int)

func (*ParaNumber) Value

func (p *ParaNumber) Value(level int) int

type Parameter

type Parameter struct {
	// contains filtered or unexported fields
}

func NewParameter

func NewParameter(hwpxFile *object.HWPXFile, method TextExtractMethod, insertParaHead bool, textMarks *TextMarks) *Parameter

func (*Parameter) Result

func (p *Parameter) Result() string

func (*Parameter) TextBuilder

func (p *Parameter) TextBuilder() *comm.TextBuilder

func (*Parameter) TextExtractMethod

func (p *Parameter) TextExtractMethod() int

type TextExtractMethod

type TextExtractMethod int
const (
	TextExtractMethod_InsertControlTextBetweenParagraphText TextExtractMethod = iota
	TextExtractMethod_AppendControlTextAfterParagraphText
)

type TextMarks

type TextMarks struct {
	ParaSeparator      string
	LineBreak          string
	Tab                string
	FieldStart         string
	FieldEnd           string
	TableStart         string
	TableEnd           string
	TableRowSeparator  string
	TableCellSeparator string
	ContainerStart     string
	ContainerEnd       string
	LineStart          string
	LineEnd            string
	RectangleStart     string
	RectangleEnd       string
	EllipseStart       string
	EllipseEnd         string
	ArcStart           string
	ArcEnd             string
	PolygonStart       string
	PolygonEnd         string
	CurveStart         string
	CurveEnd           string
	ConnectLineStart   string
	ConnectLineEnd     string
	TextArtStart       string
	TextArtEnd         string
}

func NewTextMarks

func NewTextMarks() *TextMarks

func (*TextMarks) ArcEndAnd

func (m *TextMarks) ArcEndAnd(v string) *TextMarks

func (*TextMarks) ArcStartAnd

func (m *TextMarks) ArcStartAnd(v string) *TextMarks

func (*TextMarks) ConnectLineEndAnd

func (m *TextMarks) ConnectLineEndAnd(v string) *TextMarks

func (*TextMarks) ConnectLineStartAnd

func (m *TextMarks) ConnectLineStartAnd(v string) *TextMarks

func (*TextMarks) ContainerEndAnd

func (m *TextMarks) ContainerEndAnd(v string) *TextMarks

func (*TextMarks) ContainerStartAnd

func (m *TextMarks) ContainerStartAnd(v string) *TextMarks

func (*TextMarks) CurveEndAnd

func (m *TextMarks) CurveEndAnd(v string) *TextMarks

func (*TextMarks) CurveStartAnd

func (m *TextMarks) CurveStartAnd(v string) *TextMarks

func (*TextMarks) EllipseEndAnd

func (m *TextMarks) EllipseEndAnd(v string) *TextMarks

func (*TextMarks) EllipseStartAnd

func (m *TextMarks) EllipseStartAnd(v string) *TextMarks

func (*TextMarks) FieldEndAnd

func (m *TextMarks) FieldEndAnd(v string) *TextMarks

func (*TextMarks) FieldStartAnd

func (m *TextMarks) FieldStartAnd(v string) *TextMarks

func (*TextMarks) GetArcEnd

func (m *TextMarks) GetArcEnd() string

func (*TextMarks) GetArcStart

func (m *TextMarks) GetArcStart() string

func (*TextMarks) GetConnectLineEnd

func (m *TextMarks) GetConnectLineEnd() string

func (*TextMarks) GetConnectLineStart

func (m *TextMarks) GetConnectLineStart() string

func (*TextMarks) GetContainerEnd

func (m *TextMarks) GetContainerEnd() string

func (*TextMarks) GetContainerStart

func (m *TextMarks) GetContainerStart() string

func (*TextMarks) GetCurveEnd

func (m *TextMarks) GetCurveEnd() string

func (*TextMarks) GetCurveStart

func (m *TextMarks) GetCurveStart() string

func (*TextMarks) GetEllipseEnd

func (m *TextMarks) GetEllipseEnd() string

func (*TextMarks) GetEllipseStart

func (m *TextMarks) GetEllipseStart() string

func (*TextMarks) GetFieldEnd

func (m *TextMarks) GetFieldEnd() string

func (*TextMarks) GetFieldStart

func (m *TextMarks) GetFieldStart() string

func (*TextMarks) GetLineBreak

func (m *TextMarks) GetLineBreak() string

func (*TextMarks) GetLineEnd

func (m *TextMarks) GetLineEnd() string

func (*TextMarks) GetLineStart

func (m *TextMarks) GetLineStart() string

func (*TextMarks) GetParaSeparator

func (m *TextMarks) GetParaSeparator() string

func (*TextMarks) GetPolygonEnd

func (m *TextMarks) GetPolygonEnd() string

func (*TextMarks) GetPolygonStart

func (m *TextMarks) GetPolygonStart() string

func (*TextMarks) GetRectangleEnd

func (m *TextMarks) GetRectangleEnd() string

func (*TextMarks) GetRectangleStart

func (m *TextMarks) GetRectangleStart() string

func (*TextMarks) GetTab

func (m *TextMarks) GetTab() string

func (*TextMarks) GetTableCellSeparator

func (m *TextMarks) GetTableCellSeparator() string

func (*TextMarks) GetTableEnd

func (m *TextMarks) GetTableEnd() string

func (*TextMarks) GetTableRowSeparator

func (m *TextMarks) GetTableRowSeparator() string

func (*TextMarks) GetTableStart

func (m *TextMarks) GetTableStart() string

func (*TextMarks) GetTextArtEnd

func (m *TextMarks) GetTextArtEnd() string

func (*TextMarks) GetTextArtStart

func (m *TextMarks) GetTextArtStart() string

func (*TextMarks) LineBreakAnd

func (m *TextMarks) LineBreakAnd(v string) *TextMarks

func (*TextMarks) LineEndAnd

func (m *TextMarks) LineEndAnd(v string) *TextMarks

func (*TextMarks) LineStartAnd

func (m *TextMarks) LineStartAnd(v string) *TextMarks

func (*TextMarks) ParaSeparatorAnd

func (m *TextMarks) ParaSeparatorAnd(v string) *TextMarks

func (*TextMarks) PolygonEndAnd

func (m *TextMarks) PolygonEndAnd(v string) *TextMarks

func (*TextMarks) PolygonStartAnd

func (m *TextMarks) PolygonStartAnd(v string) *TextMarks

func (*TextMarks) RectangleEndAnd

func (m *TextMarks) RectangleEndAnd(v string) *TextMarks

func (*TextMarks) RectangleStartAnd

func (m *TextMarks) RectangleStartAnd(v string) *TextMarks

func (*TextMarks) TabAnd

func (m *TextMarks) TabAnd(v string) *TextMarks

func (*TextMarks) TableCellSeparatorAnd

func (m *TextMarks) TableCellSeparatorAnd(v string) *TextMarks

func (*TextMarks) TableEndAnd

func (m *TextMarks) TableEndAnd(v string) *TextMarks

func (*TextMarks) TableRowSeparatorAnd

func (m *TextMarks) TableRowSeparatorAnd(v string) *TextMarks

func (*TextMarks) TableStartAnd

func (m *TextMarks) TableStartAnd(v string) *TextMarks

func (*TextMarks) TextArtStartAnd

func (m *TextMarks) TextArtStartAnd(v string) *TextMarks

Directories

Path Synopsis
comm: Package comm provides common interfaces and base structures for text extraction.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL