Cómo extraer texto de archivos de Word y Office (.doc, .docx, .xlsx, .pptx) con PHP

qué es un archivo doc y docx (4)

// Para DOCX: si desea conservar los espacios en blanco y tratar también las filas (tr) y celdas (tc) de las tablas, use el código siguiente y modifíquelo a su gusto. El código descarga el archivo desde una ubicación remota o local.

//=========HELPER===========

/**
 * Download $url with get_url() and store the payload in a unique temporary file.
 *
 * A unique name (instead of a fixed "tempf.*" in the working directory) keeps
 * concurrent requests from overwriting each other's downloads.
 *
 * @param string $url Location of the document to fetch.
 * @return string|false Path of the temporary file, or false (after raising an
 *                      E_USER_WARNING) when the download or the write failed.
 */
function download_to_temp_file($url)
{
    $payload = get_url($url);
    if ($payload === false) {
        trigger_error("Could not download {$url}", E_USER_WARNING);
        return false;
    }

    $tmp = tempnam(sys_get_temp_dir(), 'doc');
    if ($tmp === false) {
        trigger_error("Could not create a temporary file for {$url}", E_USER_WARNING);
        return false;
    }
    if (file_put_contents($tmp, $payload) === false) {
        unlink($tmp);
        trigger_error("Could not write temporary copy of {$url}", E_USER_WARNING);
        return false;
    }

    return $tmp;
}

//=========DOCX===========

/**
 * Extract the text of a .docx document, keeping paragraph, table-row and tab breaks.
 *
 * The text is echoed and also written to $file_name.
 *
 * @param string $url       Location of the .docx file.
 * @param string $file_name Path of the text file to write.
 * @return void
 */
function extractDocxText($url, $file_name)
{
    $tmp = download_to_temp_file($url);
    if ($tmp === false) {
        return;
    }

    $output_text = "";
    $zip_handle = new ZipArchive;
    if ($zip_handle->open($tmp) === true) {
        $xml_datas = $zip_handle->getFromName("word/document.xml");
        if ($xml_datas !== false) {
            // Turn structural tags into whitespace before stripping the markup.
            $xml_datas = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-Z"0-9 :="]+">/', "\n", $xml_datas);
            $xml_datas = preg_replace('/<w:tr>/', "\n", $xml_datas);
            $xml_datas = preg_replace('/<w:tab\/>/', "\t", $xml_datas);
            $xml_datas = preg_replace('/<\/w:p>/', "\n", $xml_datas);
            $output_text = strip_tags($xml_datas);
        }
        $zip_handle->close();
    } else {
        // Not a readable zip archive: keep the original single-space placeholder.
        $output_text = " ";
    }
    unlink($tmp);

    // Save to file and echo the content.
    file_put_contents($file_name, $output_text);
    echo $output_text;
}

//========PDF===========
// Requires installation on your Linux server:
//   sudo su
//   apt-get install xpdf

/**
 * Extract the text of a PDF through pdf2text(), echo it and write it to a file.
 *
 * pdf2text() is not defined in this snippet; it must be provided elsewhere.
 *
 * @param string $url                      Location of the PDF file.
 * @param string $PDF_fullpath_or_Filename Path of the text file to write.
 * @return void
 */
function extractPdfText($url, $PDF_fullpath_or_Filename)
{
    $tmp = download_to_temp_file($url);
    if ($tmp === false) {
        return;
    }

    $content = pdf2text($tmp);
    unlink($tmp);

    echo $content;
    file_put_contents($PDF_fullpath_or_Filename, $content);
}

//========DOC==========

/**
 * Extract readable text from a binary .doc file, echo it and write it to a file.
 *
 * The payload is split on carriage returns; chunks that are empty or contain a
 * NUL byte are dropped, and the rest are kept.
 *
 * @param string $url       Location of the .doc file.
 * @param string $file_name Path of the text file to write.
 * @return void
 */
function extractDocText($url, $file_name)
{
    // No temporary file is needed: the downloaded bytes are processed in memory.
    $doc = get_url($url);
    if ($doc === false) {
        trigger_error("Could not download {$url}", E_USER_WARNING);
        return;
    }

    $outtext = "";
    foreach (explode(chr(0x0D), $doc) as $thisline) {
        if ($thisline === '' || strpos($thisline, chr(0x00)) !== false) {
            continue;
        }
        $outtext .= $thisline . "\n";
    }

    // Replace every character outside the whitelist with a space. The negation
    // ("^") is required: without it the readable text itself would be removed.
    $content = preg_replace('/[^a-zA-Z0-9\s,.\-@\/_()]/', ' ', $outtext);

    echo $content;
    file_put_contents($file_name, $content);
}

//========XLSX==========

/**
 * Extract the shared-string cell values of the first worksheet of an .xlsx file.
 *
 * Only cells of type "s" (shared string) are emitted, one per line. The text is
 * echoed and also written to $file_name.
 *
 * @param string $url       Location of the .xlsx file.
 * @param string $file_name Path of the text file to write.
 * @return void
 */
function extractXlsxText($url, $file_name)
{
    $tmp = download_to_temp_file($url);
    if ($tmp === false) {
        return;
    }

    $content = "";
    $zip = new ZipArchive();
    if ($zip->open($tmp) === true) {
        // Read the two entries straight from the archive instead of unpacking
        // everything to disk (no leftover directory, no path traversal risk).
        $strings_xml = $zip->getFromName('xl/sharedStrings.xml');
        $sheet_xml = $zip->getFromName('xl/worksheets/sheet1.xml');
        $zip->close();

        $strings = ($strings_xml !== false) ? simplexml_load_string($strings_xml) : false;
        $sheet = ($sheet_xml !== false) ? simplexml_load_string($sheet_xml) : false;

        if ($strings !== false && $sheet !== false) {
            foreach ($sheet->sheetData->row as $xlrow) {
                foreach ($xlrow->c as $cell) {
                    // t="s" marks a cell whose value is an index into sharedStrings.xml.
                    if (!isset($cell['t']) || (string) $cell['t'] !== 's') {
                        continue;
                    }
                    $si = $strings->si[(int) (string) $cell->v];
                    if ($si === null) {
                        continue;
                    }
                    // Register and alias the default namespace, otherwise the XPath query is empty.
                    $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
                    // Concatenate all of the <t> (text) node values.
                    foreach ($si->xpath('.//n:t') as $t) {
                        $content .= $t . " \n";
                    }
                }
            }
        }
    }
    unlink($tmp);

    echo $content;
    file_put_contents($file_name, $content);
}

//========PPT==========

/**
 * Extract readable text from a binary .ppt file and write it to a file.
 *
 * Unlike the other extractors this one reads $url with file_get_contents() and
 * does not echo the result.
 *
 * @param string $url       Location (path or URL) of the .ppt file.
 * @param string $file_name Path of the text file to write.
 * @return void
 */
function extractPptText($url, $file_name)
{
    $ppt = file_get_contents($url);
    if ($ppt === false) {
        trigger_error("Could not read {$url}", E_USER_WARNING);
        return;
    }

    $outtext = '';
    foreach (explode(chr(0x0f), $ppt) as $thisline) {
        // Only chunks with three NUL bytes starting at offset 1 are treated as text records.
        if (strpos($thisline, chr(0x00) . chr(0x00) . chr(0x00)) !== 1) {
            continue;
        }
        $text_line = substr($thisline, 4);
        $end_pos = strpos($text_line, chr(0x00));
        if ($end_pos !== false) {
            $text_line = substr($text_line, 0, $end_pos);
        }
        $text_line = preg_replace('/[^a-zA-Z0-9\s,.\-@\/_()]/', " ", $text_line);
        // Chunks are prepended, so the output lists them in reverse order of appearance.
        $outtext = $text_line . "\n" . $outtext;
    }

    file_put_contents($file_name, $outtext);
}

//========PPTX==========

/**
 * Extract the text of every slide of a .pptx file, echo it and write it to a file.
 *
 * @param string $url       Location of the .pptx file.
 * @param string $file_name Path of the text file to write.
 * @return void
 */
function extractPptxText($url, $file_name)
{
    $tmp = download_to_temp_file($url);
    if ($tmp === false) {
        return;
    }

    $output_text = ' ';
    $zip_handle = new ZipArchive;
    if ($zip_handle->open($tmp) === true) {
        // Slides are stored as ppt/slides/slide1.xml, slide2.xml, ...
        for ($slide_number = 1; ; $slide_number++) {
            $xml_datas = $zip_handle->getFromName("ppt/slides/slide" . $slide_number . ".xml");
            if ($xml_datas === false) {
                break;
            }
            // Re-serialising with formatOutput puts line breaks between elements,
            // which keeps separate text runs apart once the tags are stripped.
            $xml_handle = new DOMDocument();
            $xml_handle->preserveWhiteSpace = true;
            $xml_handle->formatOutput = true;
            // LIBXML_NOENT / LIBXML_XINCLUDE are deliberately not passed: they
            // allow external entity expansion (XXE) on untrusted documents.
            if ($xml_handle->loadXML($xml_datas, LIBXML_NOERROR | LIBXML_NOWARNING)) {
                $output_text .= strip_tags($xml_handle->saveXML());
            }
        }
        $zip_handle->close();
    }
    unlink($tmp);

    echo $output_text;
    file_put_contents($file_name, $output_text);
}

/* ==========================================================================
   And below is the get_url() function, used instead of file_get_contents().
   ========================================================================== */

/**
 * Fetch a URL with cURL, following redirects.
 *
 * @param string $url         URL to fetch; it is trimmed, URL-decoded and has
 *                            "&amp;" turned back into "&" before the request.
 * @param int    $timeout     Connect and total timeout in seconds.
 * @param bool   $verify_peer Verify the TLS certificate. Pass false only for
 *                            hosts you trust whose certificate cannot be validated.
 * @return string|false Response body, or false on failure.
 */
function get_url($url, $timeout = 5, $verify_peer = true)
{
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1",
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_ENCODING       => "",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_SSL_VERIFYPEER => (bool) $verify_peer,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_MAXREDIRS      => 10,
    ));
    $content = curl_exec($ch);
    curl_close($ch);

    return $content;
}

Puede darse el caso de que necesitemos obtener el texto de documentos de Word para usarlo más adelante, por ejemplo para buscar una cadena dentro de un documento subido por el usuario (como al buscar en CV / currículums). Un problema habitual es cómo obtener ese texto, es decir, cómo abrir y leer un documento de Word subido por el usuario; existen algunos enlaces útiles, pero no resuelven todo el problema. Necesitamos extraer el texto en el momento de la carga y guardarlo en la base de datos; así podremos buscar fácilmente dentro de ella.

Aquí hay una clase sencilla que hace bien este trabajo para .doc / .docx (PHP docx reader): convierte archivos de MS Word a texto.

/**
 * Converts .doc, .docx, .xlsx and .pptx files to plain text.
 *
 * Usage: (new DocxConversion($path))->convertToText()
 */
class DocxConversion
{
    /** @var string Path of the file to convert. */
    private $filename;

    /**
     * @param string $filePath Path of the file to convert.
     */
    public function __construct($filePath)
    {
        $this->filename = $filePath;
    }

    /**
     * Read a binary .doc file and keep only its readable text.
     *
     * The file is split on carriage returns; chunks that are empty or contain
     * a NUL byte are dropped, the rest are joined with a single space.
     *
     * @return string|false Extracted text, or false when the file cannot be read.
     */
    private function read_doc()
    {
        $raw = file_get_contents($this->filename);
        if ($raw === false) {
            return false;
        }

        $outtext = "";
        foreach (explode(chr(0x0D), $raw) as $thisline) {
            if ($thisline === '' || strpos($thisline, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $thisline . " ";
        }

        // Drop every character outside the whitelist (letters, digits,
        // whitespace and a few punctuation marks).
        return preg_replace('/[^a-zA-Z0-9\s,.\-@\/_()]/', "", $outtext);
    }

    /**
     * Read word/document.xml from a .docx archive and strip its markup.
     *
     * @return string|false Extracted text ("" when the entry is missing), or
     *                      false when the archive cannot be opened.
     */
    private function read_docx()
    {
        // ZipArchive replaces the deprecated zip_open()/zip_read() functions.
        $zip = new ZipArchive();
        if ($zip->open($this->filename) !== true) {
            return false;
        }
        $content = $zip->getFromName("word/document.xml");
        $zip->close();
        if ($content === false) {
            $content = '';
        }

        // Adjacent table cells are separated by a space, paragraphs by a line break.
        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);

        return strip_tags($content);
    }

    /**
     * Parse an XML string and return it re-serialised with all tags stripped.
     *
     * @param string $xml_datas Raw XML.
     * @return string Text content, or "" when the XML cannot be parsed.
     */
    private function xml_to_text($xml_datas)
    {
        $xml_handle = new DOMDocument();
        // LIBXML_NOENT / LIBXML_XINCLUDE are deliberately not passed: they
        // allow external entity expansion (XXE) on user-supplied documents.
        if (!$xml_handle->loadXML($xml_datas, LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return "";
        }
        return strip_tags($xml_handle->saveXML());
    }

    /************************excel sheet************************************/

    /**
     * Extract the shared strings of an .xlsx workbook.
     *
     * @param string|null $input_file Path of the workbook; defaults to the file
     *                                given to the constructor.
     * @return string Extracted text, "" when nothing could be read.
     */
    public function xlsx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $output_text = "";
        $zip_handle = new ZipArchive;
        if ($zip_handle->open($input_file) === true) {
            $xml_datas = $zip_handle->getFromName("xl/sharedStrings.xml");
            if ($xml_datas !== false) {
                $output_text = $this->xml_to_text($xml_datas);
            }
            $zip_handle->close();
        }

        return $output_text;
    }

    /*************************power point files*****************************/

    /**
     * Extract the text of every slide of a .pptx presentation.
     *
     * @param string|null $input_file Path of the presentation; defaults to the
     *                                file given to the constructor.
     * @return string Extracted text, "" when nothing could be read.
     */
    public function pptx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $output_text = "";
        $zip_handle = new ZipArchive;
        if ($zip_handle->open($input_file) === true) {
            // Slides are stored as ppt/slides/slide1.xml, slide2.xml, ...
            for ($slide_number = 1; ; $slide_number++) {
                $xml_datas = $zip_handle->getFromName("ppt/slides/slide" . $slide_number . ".xml");
                if ($xml_datas === false) {
                    break;
                }
                $output_text .= $this->xml_to_text($xml_datas);
            }
            $zip_handle->close();
        }

        return $output_text;
    }

    /**
     * Convert the file to text, choosing the reader from the file extension.
     *
     * @return string|false Extracted text; "File Not exists" or
     *                      "Invalid File Type" on those errors; false when a
     *                      .doc/.docx file cannot be opened.
     */
    public function convertToText()
    {
        if (isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }

        $fileArray = pathinfo($this->filename);
        // Files without an extension fall through to "Invalid File Type";
        // the comparison is case-insensitive so "REPORT.DOCX" is accepted too.
        $file_ext = isset($fileArray['extension']) ? strtolower($fileArray['extension']) : '';

        switch ($file_ext) {
            case "doc":
                return $this->read_doc();
            case "docx":
                return $this->read_docx();
            case "xlsx":
                return $this->xlsx_to_text($this->filename);
            case "pptx":
                return $this->pptx_to_text($this->filename);
            default:
                return "Invalid File Type";
        }
    }
}

Los archivos .doc (véase Document file format) son blobs binarios y se pueden leer con fopen, mientras que los archivos .docx son simplemente archivos zip que contienen archivos xml (fuente: Wikipedia); puede leerlos usando zip_open o ZipArchive.

Uso de la clase anterior

// Pick the file to convert; the alternatives show the other supported types.
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");

// Convert the file, keep the result in $docText and print it.
echo $docText = $docObj->convertToText();

Del archivo DOC

/**
 * Return every e-mail-like token found in a string.
 *
 * A token is a run of letters, digits, ".", "_" or "-" on both sides of an "@".
 *
 * @param string $string Text to scan.
 * @return array List of matched tokens, in order of appearance.
 */
function extract_emails_from($string)
{
    preg_match_all('/[\._a-zA-Z0-9-]+@[\._a-zA-Z0-9-]+/i', $string, $matches);
    return $matches[0];
}

$filename = 'ypue file';
if (file_exists($filename)) {
    if (($fh = fopen($filename, 'r')) !== false) {
        $headers = fread($fh, 0xA00);

        // The length bytes are read at offsets 0x21C-0x21F, so a shorter
        // header cannot be decoded.
        if ($headers !== false && strlen($headers) >= 0x220) {
            // NOTE(review): the four bytes are combined as a little-endian
            // length; the "- 1" and "- 8" adjustments come from the original
            // snippet and are unexplained - confirm against the .doc format.
            $n1 = ord($headers[0x21C]) - 1;
            $n2 = (ord($headers[0x21D]) - 8) * 256;
            $n3 = ord($headers[0x21E]) * 256 * 256;
            $n4 = ord($headers[0x21F]) * 256 * 256 * 256;
            $textLength = $n1 + $n2 + $n3 + $n4;

            // fread() rejects a non-positive length.
            if ($textLength > 0) {
                $extracted_plaintext = fread($fh, $textLength);
                echo nl2br($extracted_plaintext);
                print_r(extract_emails_from($extracted_plaintext));
            }
        }

        fclose($fh);
    }
}

De DOCX:

/* Name of the document file */
$document = 'your file';

/**
 * Extract the plain text of a .docx file (or, for any other extension, an .odt file).
 *
 * @param string $filename Path of the document.
 * @return string The document text, or "File not found" when the file, the
 *                archive entry or its XML cannot be read.
 */
function extracttext($filename)
{
    if (!is_file($filename)) {
        return "File not found";
    }

    // .docx keeps its text in word/document.xml; anything else is treated as .odt.
    $ext = strtolower(pathinfo($filename, PATHINFO_EXTENSION));
    $dataFile = ($ext === 'docx') ? "word/document.xml" : "content.xml";

    $zip = new ZipArchive;
    if ($zip->open($filename) !== true) {
        return "File not found";
    }
    // Read the entry, then close the archive on every path.
    $text = $zip->getFromName($dataFile);
    $zip->close();
    if ($text === false) {
        return "File not found";
    }

    // Parse quietly. LIBXML_NOENT / LIBXML_XINCLUDE are deliberately not
    // passed: they allow external entity expansion (XXE) on untrusted files.
    $xml = new DOMDocument();
    if (!$xml->loadXML($text, LIBXML_NOERROR | LIBXML_NOWARNING)) {
        return "File not found";
    }

    // Remove the XML formatting tags and return the text.
    return strip_tags($xml->saveXML());
}

echo extracttext($document);

Para los documentos docx, sugiero el uso de la herramienta docx2txt (disponible al menos en Debian / Ubuntu):

docx2txt < your_file.docx

El README explica cómo integrarlo con Vim. Agregue lo siguiente a su .vimrc:

" Use docx2txt.pl to allow Vim to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt

(También explica cómo integrar con emacs).

Para los hackers (en el buen sentido): esta herramienta está escrita en Perl.