|
Tutorial - Extraction (Pages, Text, Images) |
Top Previous Next |
|
You can extract pages, text, and images from a PDF document and save them to a separate file. This tutorial guides you through extracting pages, text, and images from a PDF document. Follow these links to explore each one of them. 2. Extract Text
PDFtoolkit has the "ExtractPagesTo" function, which facilitates extracting pages from a currently loaded document into another document. The following example creates Output.pdf by extracting pages 1 to 10 from Doc1.pdf.
[VB]
' Load Doc1.pdf, then write pages 1 to 10 of it to Output.pdf.
With gtPDFDocumentX1
    .LoadFromFile "Doc1.pdf"
    .ExtractPagesTo "Output.pdf", "1-10"
End With
[VC++]
// Load Doc1.pdf, then write pages 1 to 10 of it to Output.pdf.
m_PDF.LoadFromFile("Doc1.pdf");
m_PDF.ExtractPagesTo("Output.pdf", "1-10");
[VB]
' Load input file
PDFDoc.LoadFromFile (InputFile1)

Dim str() As String
Dim AllStr As String
Dim i As Integer

' Extract the text from page 1.
str = PDFDoc.ExtractText("1")

' Concatenate every returned string. UBound is the last valid index,
' so the loop has to run up to and including it.
AllStr = ""
For i = LBound(str) To UBound(str)
    AllStr = AllStr & str(i)
Next i

MsgBox AllStr, , "Demo for ExtractText"
[VC++]
PDFDoc.LoadFromFile(InputFile1); CString * str; char * AllStr; char Num[10]; unsigned Long N, size; tagVARIANT myVar; myVar = PDFDoc.ExtractText("1"); str = GetStringsFromVariant(myVar,N); itoa(N,Num,10); int i, j, k; i=j=k=size = 0; For (i=0 ; i< N; i++) { size = size + strlen(str[i]); }
AllStr = New char [size]; For (i=0 ; i< N; i++) { For(int j= 0; j< strlen(str[i]);j++) { AllStr[k] = str[i][j]; k++; } } MessageBox(AllStr);
[CS]
// Load Input file
PDFDoc.LoadFromFile(InputFile1);

// Extract the text of page 1 into a string array.
string[] Str = (string[])PDFDoc.ExtractText("1");

// Join all returned strings into a single string.
string AllStr = string.Concat(Str);

MessageBox.Show(AllStr, "Demo for ExtractText");
PDFDoc.ResetDocument();
[VB]
' Load input file
PDFDoc.LoadFromFile (InputFile1)

Dim str() As String
Dim AllStr As String
Dim i As Integer

' Extract the formatted text from page 1.
str = PDFDoc.ExtractTextFormatted(1)

' Concatenate every returned string. UBound is the last valid index,
' so the loop has to run up to and including it.
AllStr = ""
For i = LBound(str) To UBound(str)
    AllStr = AllStr & str(i)
Next i

MsgBox AllStr, , "Demo for ExtractText"
PDFDoc.ResetDocument
[VC++]
PDFDoc.LoadFromFile(InputFile1); CString * str; char * AllStr; char Num[10]; unsigned Long N, size; tagVARIANT myVar; myVar = PDFDoc.ExtractTextFormatted(1); str = GetStringsFromVariant(myVar,N); itoa(N,Num,10); int i, j, k; i=j=k=size = 0; For (i=0 ; i< N; i++) { size = size + strlen(str[i]); }
AllStr = New char [size]; For (i=0 ; i< N; i++) { For(int j= 0; j< strlen(str[i]);j++) { AllStr[k] = str[i][j]; k++; } } MessageBox(AllStr);
[CS]
// Load Input file
PDFDoc.LoadFromFile(InputFile1);

// Extract the formatted Text from page 1.
string[] Str = (string[])PDFDoc.ExtractTextFormatted(1);

// Join all returned strings into a single string.
string AllStr = string.Concat(Str);

MessageBox.Show(AllStr, "Demo for ExtractText");
PDFDoc.ResetDocument();
[VB]
'this code segment makes use of MSFlexGrid
Dim TXTELE As gtPDFTextElementX
' Give each counter its own type: "Dim i, j As Integer" would leave i a Variant.
Dim i As Integer, j As Integer

PDFDoc.LoadFromFile FileName
On Error Resume Next

' j counts the grid rows filled so far; row 0 is left untouched.
j = 0
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etText Then
        Set TXTELE = PDFDoc.GetPDFTextElementAt(i, 1)
        ' One row per text element: element index, kind, origin and text.
        MSFlexGrid1.TextMatrix(j + 1, 0) = i
        MSFlexGrid1.TextMatrix(j + 1, 1) = "Text"
        MSFlexGrid1.TextMatrix(j + 1, 2) = TXTELE.XCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 3) = TXTELE.YCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 4) = TXTELE.Text
        j = j + 1
    End If

    ' Add a row to the grid before the next write would run past the last one.
    If (j + 2) > MSFlexGrid1.Rows - 1 Then
        MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
    End If
Next
[VC++] //this code segment makes use of MSFlexGrid CgtPDFTextElementX TXTELE; int i, j;
PDFDoc->LoadFromFile(InputFile); j = 0; for(i = 0; i < PDFDoc->GetTotalPDFElementsInPage(1); i++) { try { if(PDFDoc->GetPDFElementTypeAt(i, 1) == 0 /*etText*/ ) { TXTELE = PDFDoc->GetPDFTextElementAt(i, 1); msflxgrd.SetTextMatrix(j + 1, 0, I2S(i)); msflxgrd.SetTextMatrix(j + 1, 1, "Text"); msflxgrd.SetTextMatrix(j + 1, 2, D2S(TXTELE.GetXCordOrigin()));//see the function D2S below msflxgrd.SetTextMatrix(j + 1, 3, D2S(TXTELE.GetYCordOrigin())); msflxgrd.SetTextMatrix(j + 1, 4, TXTELE.GetText()); j = j + 1; }
if((j + 2) > (msflxgrd.GetRows()- 1)) msflxgrd.SetRows(msflxgrd.GetRows() + 1); } catch (CException * E) { char Error[255]; E->GetErrorMessage(Error, 255); AfxMessageBox(Error); } } // the following function converts double value to string type LPTSTR D2S(double d) { LPTSTR s; s = new char[20]; gcvt( d, 10, s); return s; }
[CS]
//this code segment uses ListView
PDFDoc.LoadFromFile(InputFile);

// Walk every element on page 1 and list the text elements.
for (int index = 0; index < PDFDoc.GetTotalPDFElementsInPage(1); index++)
{
    try
    {
        if (PDFDoc.GetPDFElementTypeAt(index, 1) != TxPDFElementType.etText)
        {
            continue;
        }

        gtPDFTextElementX textElement = PDFDoc.GetPDFTextElementAt(index, 1);

        // One row per text element: element index, kind, origin and text.
        ListViewItem row = ListView1.Items.Add(index.ToString());
        row.SubItems.Add("Text");
        row.SubItems.Add(textElement.XCordOrigin.ToString());
        row.SubItems.Add(textElement.YCordOrigin.ToString());
        row.SubItems.Add(textElement.Text);
    }
    catch (Exception ex)
    {
        // Report the failure and carry on with the next element.
        MessageBox.Show(ex.Message);
    }
}
[VB]
Dim IMGELE As gtPDFImageElementX
Dim i As Integer

gtPDFDocumentX1.LoadFromFile ("Input.pdf")

' Walk every element on page 1; show and save each BMP or JPEG image.
' Every image of a kind is saved under the same file name.
For i = 0 To gtPDFDocumentX1.GetTotalPDFElementsInPage(1) - 1
    If gtPDFDocumentX1.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etImage Then
        Set IMGELE = gtPDFDocumentX1.GetPDFImageElementAt(i, 1)
        If IMGELE.IsBMPImage Then
            Picture1.Picture = IMGELE.Image
            IMGELE.SaveToFile "Image.bmp"
        ElseIf IMGELE.IsJPEGImage Then
            Picture1.Picture = IMGELE.Image
            IMGELE.SaveToFile "Image.jpeg"
        End If
    End If
Next
[VC++]
CgtPDFImageElementX IMGELE; int i; PDFDoc.LoadFromFile(InputFile); For(i= 0; i< PDFDoc.GetTotalPDFElementsInPage(1) ;i++) { If(PDFDoc->GetPDFElementTypeAt(i,1)== etImage) { If(IMGELE.GetIsBMPImage()) IMGELE.SaveToFile("Image.bmp"); If(IMGELE.GetIsJPEGImage()) IMGELE.SaveToFile("Image.jpeg"); } }
[CS]
// Walk every element on page 1 and save each JPEG or BMP image.
// This snippet does not load a document itself.
gtPDFImageElementX IMGELE;
for (int i = 0; i < PDFDoc.GetTotalPDFElementsInPage(1); i++)
{
    if (PDFDoc.GetPDFElementTypeAt(i, 1) == TxPDFElementType.etImage)
    {
        IMGELE = PDFDoc.GetPDFImageElementAt(i, 1);

        if (IMGELE.IsJPEGImage)
        {
            IMGELE.SaveToFile("Image.jpeg");
        }
        if (IMGELE.IsBMPImage)
        {
            IMGELE.SaveToFile("Image.bmp");
        }
    }
}
[VB]
'This code segment uses MSFlexGrid
Dim FRMELE As gtPDFFormElementX
' Give each counter its own type: "Dim i, j As Integer" would leave i a Variant.
Dim i As Integer, j As Integer

PDFDoc.LoadFromFile FileName
On Error Resume Next

' j counts the grid rows filled so far; row 0 is left untouched.
j = 0
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etForm Then
        Set FRMELE = PDFDoc.GetPDFFormElementAt(i, 1)
        ' One row per form field: index, kind, box origin, name, text, field type.
        MSFlexGrid1.TextMatrix(j + 1, 0) = i
        MSFlexGrid1.TextMatrix(j + 1, 1) = "Form"
        MSFlexGrid1.TextMatrix(j + 1, 2) = FRMELE.FormFieldBox.XCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 3) = FRMELE.FormFieldBox.YCordOrigin
        MSFlexGrid1.TextMatrix(j + 1, 4) = FRMELE.FieldName
        MSFlexGrid1.TextMatrix(j + 1, 5) = FRMELE.Text.Text
        MSFlexGrid1.TextMatrix(j + 1, 6) = FieldTypes(FRMELE.FieldType)
        j = j + 1
    End If

    ' Add a row to the grid before the next write would run past the last one.
    If (j + 2) > MSFlexGrid1.Rows - 1 Then
        MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
    End If
Next
[VC++] //This code segment uses MSFlexGrid int i, j; PDFDoc->LoadFromFile(InputFile); int Tot = PDFDoc->GetTotalPDFElementsInPage(1);
j = 0; CgtPDFFormElementX FRMELE;
for(i = 0; i < Tot; i++) { try { if(PDFDoc->GetPDFElementTypeAt(i, 1) == 3 /*etForm*/ ) { FRMELE = PDFDoc->GetPDFFormElementAt(i, 1); msflxgrd.SetTextMatrix(j + 1, 0, I2S(i)); msflxgrd.SetTextMatrix(j + 1, 1, "Form"); msflxgrd.SetTextMatrix(j + 1, 2, D2S(FRMELE.GetXCordOrigin()));
msflxgrd.SetTextMatrix(j + 1, 3, D2S(FRMELE.GetYCordOrigin()));
msflxgrd.SetTextMatrix(j + 1, 4, FRMELE.GetFieldName());
msflxgrd.SetTextMatrix(j + 1, 5, FRMELE.GetText().GetText()); msflxgrd.SetTextMatrix(j + 1, 6, FieldTypes[FRMELE.GetFieldType()]); j = j + 1; }
if((j + 2) > (msflxgrd.GetRows()- 1)) msflxgrd.SetRows(msflxgrd.GetRows() + 1); } catch (CException * E) { char Error[255]; E->GetErrorMessage(Error, 255); AfxMessageBox(Error); } }
[CS]
//This code segment uses ListView
PDFDoc.LoadFromFile(InputFile);

// Read the element count once, then list every form field on page 1.
int elementCount = PDFDoc.GetTotalPDFElementsInPage(1);

for (int index = 0; index < elementCount; index++)
{
    try
    {
        if (PDFDoc.GetPDFElementTypeAt(index, 1) != TxPDFElementType.etForm)
        {
            continue;
        }

        gtPDFFormElementX formElement = PDFDoc.GetPDFFormElementAt(index, 1);

        // One row per form field: index, kind, box origin, name, text, field type.
        ListViewItem row = ListView1.Items.Add(index.ToString());
        row.SubItems.Add("Form");
        row.SubItems.Add(formElement.FormFieldBox.XCordOrigin.ToString());
        row.SubItems.Add(formElement.FormFieldBox.YCordOrigin.ToString());
        row.SubItems.Add(formElement.FieldName);
        row.SubItems.Add(formElement.Text.Text);
        row.SubItems.Add(FieldTypes[(int)formElement.FieldType]);
    }
    catch (Exception ex)
    {
        // Report the failure and carry on with the next element.
        MessageBox.Show(ex.Message);
    }
}
[VB]
'This code segment uses MS FlexGrid
Dim PATHELE As gtPDFPathElementX
Dim CURELE As gtPDFCurveElementX
Dim RECTELE As gtPDFRectangleElementX
Dim LNELE As gtPDFLineElementX
' Give each counter its own type: "Dim i, j, k As Integer" would leave i and j Variants.
Dim i As Integer, j As Integer, k As Integer, m As Integer
Dim Points() As Integer
Dim str As String

PDFDoc.LoadFromFile FileName
On Error Resume Next

' j counts the grid rows filled so far; row 0 is left untouched.
j = 0
For i = 0 To PDFDoc.GetTotalPDFElementsInPage(1) - 1
    If PDFDoc.GetPDFElementTypeAt(i, 1) = TxPDFElementType.etPath Then
        Set PATHELE = PDFDoc.GetPDFPathElementAt(i, 1)

        ' A path is made of lines, curves and rectangles; list each part.
        For k = 0 To PATHELE.GetTotalPathElements() - 1
            Select Case PATHELE.GetPathElementTypeAt(k)
                Case TxPDFElementType.etLine
                    Set LNELE = PATHELE.GetLineElementAt(k)
                    MSFlexGrid1.TextMatrix(j + 1, 0) = i
                    MSFlexGrid1.TextMatrix(j + 1, 1) = "Line"
                    MSFlexGrid1.TextMatrix(j + 1, 2) = LNELE.XCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 3) = LNELE.YCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 4) = LNELE.XCordDestination
                    MSFlexGrid1.TextMatrix(j + 1, 5) = LNELE.YCordDestination
                    j = j + 1

                Case TxPDFElementType.etCurve
                    Set CURELE = PATHELE.GetCurveElementAt(k)
                    Points = CURELE.Points

                    ' Start from an empty string so the points of an earlier
                    ' curve are not carried over into this row.
                    str = ""
                    For m = 0 To UBound(Points)
                        str = str + Conversion.CStr(Points(m)) + ", "
                    Next

                    MSFlexGrid1.TextMatrix(j + 1, 0) = i
                    MSFlexGrid1.TextMatrix(j + 1, 1) = "Curve"
                    MSFlexGrid1.TextMatrix(j + 1, 2) = CURELE.XCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 3) = CURELE.YCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 4) = str
                    MSFlexGrid1.TextMatrix(j + 1, 5) = PATHELE.IsClosePath
                    j = j + 1

                Case TxPDFElementType.etRectangle
                    Set RECTELE = PATHELE.GetRectangleElementAt(k)
                    MSFlexGrid1.TextMatrix(j + 1, 0) = i
                    MSFlexGrid1.TextMatrix(j + 1, 1) = "Rectangle"
                    MSFlexGrid1.TextMatrix(j + 1, 2) = RECTELE.XCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 3) = RECTELE.YCordOrigin
                    MSFlexGrid1.TextMatrix(j + 1, 4) = RECTELE.XCordDestination
                    MSFlexGrid1.TextMatrix(j + 1, 5) = RECTELE.YCordDestination
                    MSFlexGrid1.TextMatrix(j + 1, 6) = PATHELE.IsClosePath
                    j = j + 1
            End Select

            ' Add a row to the grid before the next write would run past the
            ' last one, whichever kind of part was just listed.
            If (j + 2) > MSFlexGrid1.Rows - 1 Then
                MSFlexGrid1.Rows = MSFlexGrid1.Rows + 1
            End If
        Next
    End If
Next
[VC++] //This code segment uses MS FlexGrid CgtPDFPathElementX PATHELE; CgtPDFCurveElementX CURELE; CgtPDFRectangleElementX RECTELE; CgtPDFLineElementX LNELE; int i, j, k, Tot;
PDFDoc->LoadFromFile(InputFile); Tot = PDFDoc->GetTotalPDFElementsInPage(1); j = 0; for(i= 0; i< PDFDoc->GetTotalPDFElementsInPage(1) ;i++) { try { if(PDFDoc->GetPDFElementTypeAt(i,1)== etPath) /*2*/ { PATHELE = PDFDoc->GetPDFPathElementAt(i,1);
for(k=0; k< PATHELE.GetTotalPathElements(); k++) { if(PATHELE.GetPathElementTypeAt(k) == etLine) /*4*/ { LNELE = PATHELE.GetLineElementAt(k);
if(ElemType == etLine) /*4*/ { msflxgrd.SetTextMatrix(j + 1, 0, I2S(i)); msflxgrd.SetTextMatrix(j + 1, 1, "Line"); msflxgrd.SetTextMatrix(j + 1, 2, D2S(LNELE.GetXCordOrigin())); msflxgrd.SetTextMatrix(j + 1, 3, D2S(LNELE.GetYCordOrigin())); msflxgrd.SetTextMatrix(j + 1, 4, D2S(LNELE.GetXCordDestination())); msflxgrd.SetTextMatrix(j + 1, 5, D2S(LNELE.GetYCordDestination())); } if((j + 2) > (msflxgrd.GetRows()- 1)) msflxgrd.SetRows(msflxgrd.GetRows() + 1); }
if(PATHELE.GetPathElementTypeAt(k) == etCurve) /*6*/ { CURELE = PATHELE.GetCurveElementAt(k);
unsigned long N; tagVARIANT myVar2; myVar2 = CURELE.GetPoints();
long * Points; Points = GetIntArrayFromVariant(myVar2, N);
CString Str = "" |