Tutorial - Extraction (Pages, Text, Images)

Top  Previous  Next

You can extract pages, text, and images from a PDF document and save them to a separate file. This tutorial shows how to extract pages, text, and images, as well as form and path elements, from a PDF document. Follow the links below to explore each topic.

1.        Extract Pages

2.        Extract Text        

3.        Extract Text Formatted

4. Extract Text Elements

5.        Extract Images

6. Form Element Extraction
7.        Path Element Extraction                                

 

 

 

1)Extract Pages

 

PDFtoolkit has the "ExtractPagesTo" function, which facilitates extracting pages from a currently loaded document into another document. The following example creates Output.pdf by extracting pages 1 to 10 from Doc1.pdf.         

 

[VB]         

' Load the source document.
gtPDFDocumentX1.LoadFromFile ("Doc1.pdf") 

' Extract the page range "1-10" of the loaded document into Output.pdf.
gtPDFDocumentX1.ExtractPagesTo "Output.pdf", "1-10"

 

[VC++] 

// Load the source document.
m_PDF.LoadFromFile ("Doc1.pdf"); 

// Extract the page range "1-10" of the loaded document into Output.pdf.
m_PDF.ExtractPagesTo( "Output.pdf", "1-10");        

 

 

 

2)Extract Text

 

 

[VB]

' Load input file
PDFDoc.LoadFromFile (InputFile1)

Dim str() As String
Dim AllStr As String
Dim i As Integer

' Extract the text from page 1; the result is returned as an array of strings.
str = PDFDoc.ExtractText("1")

' Join every element of the array. UBound already returns the last valid
' index, so looping to UBound(str) - 1 would silently drop the final element.
AllStr = ""
For i = LBound(str) To UBound(str)
    AllStr = AllStr & str(i)
Next i

MsgBox AllStr, , "Demo for ExtractText"

 

 

[VC++] 

 

PDFDoc.LoadFromFile(InputFile1);

CString * str;

char * AllStr;

char Num[10];

unsigned Long N, size;

tagVARIANT myVar;

myVar = PDFDoc.ExtractText("1");

str  = GetStringsFromVariant(myVar,N);

itoa(N,Num,10);

int i, j, k;

i=j=k=size = 0;

For (i=0 ; i< N; i++)

{

       size = size + strlen(str[i]);        

}

 

AllStr = New char [size];

For (i=0 ; i< N; i++)

{

       For(int j= 0; j< strlen(str[i]);j++)

       {

               AllStr[k] = str[i][j];

               k++;

       }

}

MessageBox(AllStr);

 

[CS]

 

// Load Input file
PDFDoc.LoadFromFile(InputFile1);

// Extract the text of page 1; the call returns an array of strings.
String[] Str = (String[]) PDFDoc.ExtractText("1");

// Join all pieces into a single string.
String AllStr = String.Concat(Str);

MessageBox.Show(AllStr, "Demo for ExtractText");
PDFDoc.ResetDocument();

 

 

3)Extract Text Formatted

 

[VB]

 

' Load input file
PDFDoc.LoadFromFile (InputFile1)

Dim str() As String
Dim AllStr As String
Dim i As Integer

' Extract the formatted text from page 1; the result is an array of strings.
str = PDFDoc.ExtractTextFormatted(1)

' Join every element of the array. UBound already returns the last valid
' index, so looping to UBound(str) - 1 would silently drop the final element.
AllStr = ""
For i = LBound(str) To UBound(str)
    AllStr = AllStr & str(i)
Next i

MsgBox AllStr, , "Demo for ExtractText"
PDFDoc.ResetDocument

 

[VC++] 

 

PDFDoc.LoadFromFile(InputFile1);

CString * str;

char * AllStr;

char Num[10];

unsigned Long N, size;

tagVARIANT myVar;

myVar = PDFDoc.ExtractTextFormatted(1);

str  = GetStringsFromVariant(myVar,N);

itoa(N,Num,10);

int i, j, k;

i=j=k=size = 0;

For (i=0 ; i< N; i++)

{

       size = size + strlen(str[i]);        

}

 

AllStr = New char [size];

For (i=0 ; i< N; i++)

{

       For(int j= 0; j< strlen(str[i]);j++)

       {

               AllStr[k] = str[i][j];

               k++;

       }

}

MessageBox(AllStr);

       

[CS]

// Load Input file
PDFDoc.LoadFromFile(InputFile1);

// Extract the formatted text from page 1. This section demonstrates
// ExtractTextFormatted, matching the VB and VC++ variants above.
String[] Str = (String[]) PDFDoc.ExtractTextFormatted(1);

// Join all pieces into a single string.
String AllStr = String.Concat(Str);

MessageBox.Show(AllStr, "Demo for ExtractText");
PDFDoc.ResetDocument();

 

 

 

4)Extract Text Elements

 

[VB]

'this code segment makes use of MSFlexGrid
Dim TXTELE As gtPDFTextElementX
' Give each counter its own type: "Dim i, j As Integer" would leave i a Variant.
Dim i As Integer, j As Integer

PDFDoc.LoadFromFile FileName
On Error Resume Next

j = 0   ' number of text elements written to the grid so far
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etText Then
        Set TXTELE = PDFDoc.GetPDFTextElementAt(i, 1)
        ' Data rows start at grid row 1; columns are: element index, kind,
        ' origin X, origin Y, text.
        MSFlexGrid1.TextMatrix(j + 1, 0) = i
        MSFlexGrid1.TextMatrix(j + 1, 1) = "Text"
        MSFlexGrid1.TextMatrix(j + 1, 2) = TXTELE.XCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 3) = TXTELE.YCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 4) = TXTELE.Text
        j = j + 1
    End If

    ' Add a row whenever the grid is about to run out of space.
    If (j + 2) > MSFlexGrid1.Rows - 1 Then
        MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
    End If
Next

 

 

[VC++]

//this code segment makes use of MSFlexGrid

CgtPDFTextElementX TXTELE;

int i, j;

 

PDFDoc->LoadFromFile(InputFile);

j = 0;

for(i = 0; i < PDFDoc->GetTotalPDFElementsInPage(1); i++)

{

try

{

   if(PDFDoc->GetPDFElementTypeAt(i, 1) == 0 /*etText*/ )

   {

     TXTELE = PDFDoc->GetPDFTextElementAt(i, 1);

     msflxgrd.SetTextMatrix(j + 1, 0, I2S(i));

     msflxgrd.SetTextMatrix(j + 1, 1, "Text");

msflxgrd.SetTextMatrix(j + 1, 2, D2S(TXTELE.GetXCordOrigin()));//see the function D2S below

     msflxgrd.SetTextMatrix(j + 1, 3, D2S(TXTELE.GetYCordOrigin()));

     msflxgrd.SetTextMatrix(j + 1, 4, TXTELE.GetText());

     j = j + 1;

   }

 

   if((j + 2) > (msflxgrd.GetRows()- 1))

     msflxgrd.SetRows(msflxgrd.GetRows() + 1);

}

catch (CException * E)

{

   char Error[255];

   E->GetErrorMessage(Error, 255);

   AfxMessageBox(Error);

}

}

// the following function converts double value to string type
// It formats d with gcvt using 10 significant digits into a freshly
// allocated 20-character buffer and returns that buffer. The function never
// frees it, so the caller owns the result and must release it with delete[].
// NOTE(review): the buffer is allocated as char while the return type is
// LPTSTR - presumably this assumes a non-Unicode (ANSI) build; confirm.

LPTSTR D2S(double d)

{

LPTSTR s;

// Heap allocation: ownership passes to the caller on return.
s = new char[20];

gcvt( d, 10, s);

return s;

}

 

[CS]

//this code segment uses ListView
PDFDoc.LoadFromFile(InputFile);

for (int index = 0; index < PDFDoc.GetTotalPDFElementsInPage(1); index++)
{
    try
    {
        // Only text elements are listed; skip every other element type.
        if (PDFDoc.GetPDFElementTypeAt(index, 1) != TxPDFElementType.etText)
        {
            continue;
        }

        gtPDFTextElementX textElement = PDFDoc.GetPDFTextElementAt(index, 1);

        // One row per text element: index, kind, origin X, origin Y, text.
        ListViewItem row = ListView1.Items.Add(index.ToString());
        row.SubItems.Add("Text");
        row.SubItems.Add(textElement.XCordOrigin.ToString());
        row.SubItems.Add(textElement.YCordOrigin.ToString());
        row.SubItems.Add(textElement.Text);
    }
    catch (Exception error)
    {
        // Report the problem and carry on with the next element.
        MessageBox.Show(error.Message);
    }
}

 

 

5) Extract Images

 

 

[VB]

 

Dim IMGELE As gtPDFImageElementX
Dim i As Integer

gtPDFDocumentX1.LoadFromFile ("Input.pdf")

' Walk through every element of page 1 and save the images among them.
For i = 0 To gtPDFDocumentX1.GetTotalPDFElementsInPage(1) - 1
    If gtPDFDocumentX1.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etImage Then
        Set IMGELE = gtPDFDocumentX1.GetPDFImageElementAt(i, 1)

        ' The element index is part of the file name so that a page holding
        ' several images does not keep overwriting a single output file.
        If IMGELE.IsBMPImage Then
            Picture1.Picture = IMGELE.Image
            IMGELE.SaveToFile ("Image" & i & ".bmp")
        ElseIf IMGELE.IsJPEGImage Then
            Picture1.Picture = IMGELE.Image
            IMGELE.SaveToFile ("Image" & i & ".jpeg")
        End If
    End If
Next

 

[VC++]

 

CgtPDFImageElementX IMGELE;

int i;

PDFDoc.LoadFromFile(InputFile);

For(i= 0; i< PDFDoc.GetTotalPDFElementsInPage(1) ;i++)

{

       If(PDFDoc->GetPDFElementTypeAt(i,1)== etImage)

       {

               If(IMGELE.GetIsBMPImage())

                   IMGELE.SaveToFile("Image.bmp");

                       If(IMGELE.GetIsJPEGImage())

                               IMGELE.SaveToFile("Image.jpeg");

       }

}

 

 

[CS]

 

gtPDFImageElementX IMGELE;

// Load the document first, as the VB and VC++ variants of this example do.
PDFDoc.LoadFromFile(InputFile);

// Walk through every element of page 1 and save the images among them.
for (int i = 0; i < PDFDoc.GetTotalPDFElementsInPage(1); i++)
{
    if (PDFDoc.GetPDFElementTypeAt(i, 1) == TxPDFElementType.etImage)
    {
        IMGELE = PDFDoc.GetPDFImageElementAt(i, 1);

        // The element index is part of the file name so that a page holding
        // several images does not keep overwriting a single output file.
        if (IMGELE.IsJPEGImage)
        {
            IMGELE.SaveToFile("Image" + i.ToString() + ".jpeg");
        }
        else if (IMGELE.IsBMPImage)
        {
            IMGELE.SaveToFile("Image" + i.ToString() + ".bmp");
        }
    }
}

 

 

 

6)Form Element Extraction

 

[VB]

'This code segment uses MSFlexGrid
Dim FRMELE As gtPDFFormElementX
' Give each counter its own type: "Dim i, j As Integer" would leave i a Variant.
Dim i As Integer, j As Integer

PDFDoc.LoadFromFile FileName
On Error Resume Next

j = 0   ' number of form elements written to the grid so far
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etForm Then
        Set FRMELE = PDFDoc.GetPDFFormElementAt(i, 1)
        ' Data rows start at grid row 1; columns are: element index, kind,
        ' field box origin X, field box origin Y, field name, field text,
        ' field type (looked up in the FieldTypes table defined elsewhere).
        MSFlexGrid1.TextMatrix(j + 1, 0) = i
        MSFlexGrid1.TextMatrix(j + 1, 1) = "Form"
        MSFlexGrid1.TextMatrix(j + 1, 2) = FRMELE.FormFieldBox.XCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 3) = FRMELE.FormFieldBox.YCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 4) = FRMELE.FieldName
        MSFlexGrid1.TextMatrix(j + 1, 5) = FRMELE.Text.Text
        MSFlexGrid1.TextMatrix(j + 1, 6) = FieldTypes(FRMELE.FieldType)
        j = j + 1
    End If

    ' Add a row whenever the grid is about to run out of space.
    If (j + 2) > MSFlexGrid1.Rows - 1 Then
        MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
    End If
Next

 

[VC++]

//This code segment uses MSFlexGrid

int i, j;

PDFDoc->LoadFromFile(InputFile);

int Tot = PDFDoc->GetTotalPDFElementsInPage(1);

 

j = 0;

CgtPDFFormElementX FRMELE;

 

for(i = 0; i < Tot; i++)

{

try

{

   if(PDFDoc->GetPDFElementTypeAt(i, 1) == 3 /*etForm*/ )

   {

     FRMELE = PDFDoc->GetPDFFormElementAt(i, 1);

     msflxgrd.SetTextMatrix(j + 1, 0, I2S(i));

     msflxgrd.SetTextMatrix(j + 1, 1, "Form");

     msflxgrd.SetTextMatrix(j + 1, 2, D2S(FRMELE.GetXCordOrigin()));

 

     msflxgrd.SetTextMatrix(j + 1, 3, D2S(FRMELE.GetYCordOrigin()));

 

     msflxgrd.SetTextMatrix(j + 1, 4, FRMELE.GetFieldName());

 

     msflxgrd.SetTextMatrix(j + 1, 5, FRMELE.GetText().GetText());

     msflxgrd.SetTextMatrix(j + 1, 6, FieldTypes[FRMELE.GetFieldType()]);

     j = j + 1;

   }

 

     if((j + 2) > (msflxgrd.GetRows()- 1))

       msflxgrd.SetRows(msflxgrd.GetRows() + 1);

  }

  catch (CException * E)

  {

     char Error[255];

     E->GetErrorMessage(Error, 255);

     AfxMessageBox(Error);

  }

}

 

[CS]

//This code segment uses ListView
PDFDoc.LoadFromFile(InputFile);

// The element count of page 1 is read once, before the loop starts.
int elementCount = PDFDoc.GetTotalPDFElementsInPage(1);

for (int index = 0; index < elementCount; index++)
{
    try
    {
        // Only form elements are listed; skip every other element type.
        if (PDFDoc.GetPDFElementTypeAt(index, 1) != TxPDFElementType.etForm)
        {
            continue;
        }

        gtPDFFormElementX formElement = PDFDoc.GetPDFFormElementAt(index, 1);

        // One row per form element: index, kind, field box origin X and Y,
        // field name, field text, field type (looked up in FieldTypes).
        ListViewItem row = ListView1.Items.Add(index.ToString());
        row.SubItems.Add("Form");
        row.SubItems.Add(formElement.FormFieldBox.XCordOrigin.ToString());
        row.SubItems.Add(formElement.FormFieldBox.YCordOrigin.ToString());
        row.SubItems.Add(formElement.FieldName);
        row.SubItems.Add(formElement.Text.Text);
        row.SubItems.Add(FieldTypes[(int)formElement.FieldType]);
    }
    catch (Exception error)
    {
        // Report the problem and carry on with the next element.
        MessageBox.Show(error.Message);
    }
}

 

 

 

7)Path Element Extraction

 

[VB]

'This code segment uses MS FlexGrid
Dim PATHELE As gtPDFPathElementX
Dim CURELE As gtPDFCurveElementX
Dim RECTELE As gtPDFRectangleElementX
Dim LNELE As gtPDFLineElementX
' Give each counter its own type: "Dim i, j, k As Integer" would leave i and j Variants.
Dim i As Integer, j As Integer, k As Integer, m As Integer
Dim Points() As Integer
Dim str As String

PDFDoc.LoadFromFile FileName
On Error Resume Next

j = 0   ' number of rows written to the grid so far
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etPath Then
        Set PATHELE = PDFDoc.GetPDFPathElementAt(i, 1)

        ' A path is made of lines, curves and rectangles; list each of them.
        ' The sub-element type test below is the only filter needed, so the
        ' former extra test on the undeclared ElemType variable is gone.
        For k = 0 To PATHELE.GetTotalPathElements() - 1

            If PATHELE.GetPathElementTypeAt(k) = TxPDFElementType.etLine Then
                Set LNELE = PATHELE.GetLineElementAt(k)
                MSFlexGrid1.TextMatrix(j + 1, 0) = i
                MSFlexGrid1.TextMatrix(j + 1, 1) = "Line"
                MSFlexGrid1.TextMatrix(j + 1, 2) = LNELE.XCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 3) = LNELE.YCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 4) = LNELE.XCordDestination
                MSFlexGrid1.TextMatrix(j + 1, 5) = LNELE.YCordDestination
                j = j + 1

            ElseIf PATHELE.GetPathElementTypeAt(k) = TxPDFElementType.etCurve Then
                Set CURELE = PATHELE.GetCurveElementAt(k)
                Points = CURELE.Points

                ' Build the point list afresh for every curve; otherwise the
                ' points of earlier curves would pile up in the same string.
                str = ""
                For m = 0 To UBound(Points)
                    str = str + Conversion.CStr(Points(m)) + ", "
                Next

                MSFlexGrid1.TextMatrix(j + 1, 0) = i
                MSFlexGrid1.TextMatrix(j + 1, 1) = "Curve"
                MSFlexGrid1.TextMatrix(j + 1, 2) = CURELE.XCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 3) = CURELE.YCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 4) = str
                MSFlexGrid1.TextMatrix(j + 1, 5) = PATHELE.IsClosePath
                j = j + 1

            ElseIf PATHELE.GetPathElementTypeAt(k) = TxPDFElementType.etRectangle Then
                Set RECTELE = PATHELE.GetRectangleElementAt(k)
                MSFlexGrid1.TextMatrix(j + 1, 0) = i
                MSFlexGrid1.TextMatrix(j + 1, 1) = "Rectangle"
                MSFlexGrid1.TextMatrix(j + 1, 2) = RECTELE.XCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 3) = RECTELE.YCordOrigin
                MSFlexGrid1.TextMatrix(j + 1, 4) = RECTELE.XCordDestination
                MSFlexGrid1.TextMatrix(j + 1, 5) = RECTELE.YCordDestination
                MSFlexGrid1.TextMatrix(j + 1, 6) = PATHELE.IsClosePath
                j = j + 1
            End If

            ' Add a row whenever the grid is about to run out of space. This
            ' runs for every sub-element, not only after lines, so curves and
            ' rectangles always have a row to land in.
            If (j + 2) > MSFlexGrid1.Rows - 1 Then
                MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
            End If
        Next
    End If
Next

 

[VC++]

//This code segment uses MS FlexGrid

CgtPDFPathElementX PATHELE;

CgtPDFCurveElementX CURELE;

CgtPDFRectangleElementX RECTELE;

CgtPDFLineElementX LNELE;

int i, j, k, Tot;

 

PDFDoc->LoadFromFile(InputFile);

Tot = PDFDoc->GetTotalPDFElementsInPage(1);

j = 0;

for(i= 0; i< PDFDoc->GetTotalPDFElementsInPage(1) ;i++)

{

try

{

   if(PDFDoc->GetPDFElementTypeAt(i,1)==  etPath)  /*2*/

   {

     PATHELE = PDFDoc->GetPDFPathElementAt(i,1);

 

     for(k=0; k< PATHELE.GetTotalPathElements(); k++)

     {

       if(PATHELE.GetPathElementTypeAt(k) == etLine)    /*4*/

       {

         LNELE = PATHELE.GetLineElementAt(k);

 

         if(ElemType ==  etLine)    /*4*/

         {

           msflxgrd.SetTextMatrix(j + 1, 0, I2S(i));

           msflxgrd.SetTextMatrix(j + 1, 1, "Line");

           msflxgrd.SetTextMatrix(j + 1, 2, D2S(LNELE.GetXCordOrigin()));

           msflxgrd.SetTextMatrix(j + 1, 3, D2S(LNELE.GetYCordOrigin()));

           msflxgrd.SetTextMatrix(j + 1, 4, D2S(LNELE.GetXCordDestination()));

           msflxgrd.SetTextMatrix(j + 1, 5, D2S(LNELE.GetYCordDestination()));

         }

         if((j + 2) > (msflxgrd.GetRows()- 1))

           msflxgrd.SetRows(msflxgrd.GetRows() + 1);

       }

 

       if(PATHELE.GetPathElementTypeAt(k) == etCurve) /*6*/

       {

         CURELE = PATHELE.GetCurveElementAt(k);

 

         unsigned long N;

         tagVARIANT myVar2;

         myVar2 = CURELE.GetPoints();

 

         long * Points;

         Points = GetIntArrayFromVariant(myVar2, N);

 

                   CString Str = ""