Opticon Stockholm is on Tuesday September 10th, hope to see you there!
Opticon Stockholm is on Tuesday September 10th, hope to see you there!
I'm afraid I never found an answer to my question and ended up simply not using EPiServer.Find.Cms.AttachmentFilter.
The customer was content with the hit relevance as it was, even without the filter, so I moved along to the next task at hand. :)
Hi
You could try installing the iTextSharp package
and creating a parser like the sample code below. Note that in this solution we use a Stream instead of binary content.
/// <summary>
/// Parses a PDF file and extracts the text from its page content streams.
/// </summary>
/// <remarks>
/// This is a lightweight scanner, not a full PDF interpreter: it walks the raw
/// bytes of each page content stream and collects the characters found inside
/// parentheses while a text object is open.
/// </remarks>
public class PDFParser
{
    // Content stream operators this scanner reacts to:
    //   BT = beginning of a text object
    //   ET = end of a text object
    //   Td / TD = move to the start of the next line
    //   T*, ' and " = next-line operators
    //   Tj = show text
    //   5 Ts = superscript, -5 Ts = subscript (not interpreted here)

    /// <summary>
    /// The number of characters to keep, when extracting text.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Token sets are hoisted so they are not re-allocated for every input byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] MoveTextTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };

    // Literal escape sequences left in the extracted text, paired index by index
    // with the character each one is replaced by.
    private static readonly string[] EscapeSequences =
    {
        @"\(", @"\)", @"\226", @"\222", @"\223", @"\224",
        @"\340", @"\342", @"\344", @"\300", @"\302", @"\304",
        @"\351", @"\350", @"\352", @"\353", @"\311", @"\310", @"\312", @"\313",
        @"\362", @"\364", @"\366", @"\322", @"\324", @"\326",
        @"\354", @"\356", @"\357", @"\314", @"\316", @"\317",
        @"\347", @"\307",
        @"\371", @"\373", @"\374", @"\331", @"\333", @"\334",
        @"\256", @"\231", @"\253", @"\273", @"\251", @"\221"
    };

    private static readonly string[] EscapeReplacements =
    {
        "(", ")", "-", "'", "\"", "\"",
        "à", "â", "ä", "À", "Â", "Ä",
        "é", "è", "ê", "ë", "É", "È", "Ê", "Ë",
        "ò", "ô", "ö", "Ò", "Ô", "Ö",
        "ì", "î", "ï", "Ì", "Î", "Ï",
        "ç", "Ç",
        "ù", "û", "ü", "Ù", "Û", "Ü",
        "®", "™", "«", "»", "©", "'"
    };

    /// <summary>
    /// Extracts the text of every page of a PDF file.
    /// </summary>
    /// <param name="s">Stream positioned at the start of the PDF data.</param>
    /// <returns>The text of all pages, each page followed by a single space.</returns>
    public string ExtractText(Stream s)
    {
        var sb = new StringBuilder();
        // Create a reader for the given PDF file
        PdfReader reader = new PdfReader(s);
        try
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                sb.Append(ExtractTextFromPDFBytes(reader.GetPageContent(page)));
                sb.Append(' ');
            }
        }
        finally
        {
            // Release the reader even when a page fails to load.
            reader.Close();
        }
        return sb.ToString();
    }

    /// <summary>
    /// This method processes an uncompressed Adobe (text) object
    /// and extracts text.
    /// </summary>
    /// <param name="input">Uncompressed content stream bytes; may be null or empty.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is
    /// null or empty, or when extraction fails.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0) return "";
        try
        {
            var result = new StringBuilder();
            // Flag showing if we are currently inside a text object
            bool inTextObject = false;
            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;
            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;
            // Sliding window over the most recent characters, used to spot operators.
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < NumberOfCharsToKeep; j++) previousCharacters[j] = ' ';

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];
                // Byte 213 is emitted as an apostrophe.
                if (input[i] == 213) c = '\'';

                if (inTextObject)
                {
                    // Position the text
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTextTokens, previousCharacters))
                        {
                            result.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object: leave it and separate it from what follows.
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Keep the backslash so CleanupContent can decode the escape later.
                            result.Append(c);
                            nextLiteral = true;
                        }
                        else
                        {
                            // Only printable ASCII and the 128-254 range are kept.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                result.Append(c);
                            }
                            nextLiteral = false;
                        }
                    }
                }

                // Store the recent characters for
                // when we have to go back for a checking
                for (int j = 0; j < NumberOfCharsToKeep - 1; j++)
                {
                    previousCharacters[j] = previousCharacters[j + 1];
                }
                previousCharacters[NumberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }
            return CleanupContent(result.ToString());
        }
        catch
        {
            // Deliberate best effort: a page that cannot be scanned contributes no text
            // instead of aborting the extraction of the whole document.
            return "";
        }
    }

    /// <summary>
    /// Replaces the escape sequences left in the extracted text (escaped parentheses
    /// and a fixed set of octal character codes) with the characters they stand for.
    /// </summary>
    /// <param name="text">Raw extracted text.</param>
    /// <returns>The text with the known escape sequences replaced.</returns>
    private string CleanupContent(string text)
    {
        // The sequences are plain literals, so an ordinal string replace is enough.
        for (int i = 0; i < EscapeSequences.Length; i++)
        {
            text = text.Replace(EscapeSequences[i], EscapeReplacements[i]);
        }
        return text;
    }

    /// <summary>
    /// Check if one of the given operator tokens (e.g. BT) just came along,
    /// delimited by a space, CR or LF on both sides.
    /// </summary>
    /// <param name="tokens">the searched tokens; one-character tokens such as ' are supported</param>
    /// <param name="recent">the recent character array, newest character last</param>
    /// <returns>true when any token ends right before the newest character</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = NumberOfCharsToKeep - 1;
        // The newest character must be the delimiter that follows the token.
        if (!IsDelimiter(recent[last])) return false;

        foreach (string token in tokens)
        {
            int length = token.Length;
            // The window must hold the token plus one delimiter on each side.
            if (length == 0 || length > last - 1) continue;

            int start = last - length;
            if (!IsDelimiter(recent[start - 1])) continue;

            bool matches = true;
            for (int k = 0; k < length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }
            if (matches) return true;
        }
        return false;
    }

    /// <summary>
    /// Tells whether a character separates tokens: space, CR or LF.
    /// </summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
}
For this example, I created a new content type called PDFMedia and also created a new extension class that exposes the extracted text:
/// <summary>
/// Media content type registered for files with the "pdf" extension.
/// It declares no members of its own; everything comes from MediaData.
/// </summary>
// NOTE(review): [IndexInContentAreas] presumably opts this type into indexing when
// it is referenced from a content area — confirm against the Find documentation.
[ContentType(GUID = "52649275-1EB9-462E-BC0C-6AD3F14A623D")]
[MediaDescriptor(ExtensionString = "pdf")]
[IndexInContentAreas]
public class PDFMedia : MediaData
{ }
/// <summary>
/// Extension methods that expose the text of a <see cref="PDFMedia"/> file,
/// extracted with <see cref="PDFParser"/>.
/// </summary>
public static class PDFMediaExtension
{
    /// <summary>
    /// Returns the text extracted from the PDF's binary data.
    /// </summary>
    /// <param name="pdfMedia">The PDF media item to read.</param>
    /// <returns>The extracted text, or null when the item has no binary data.</returns>
    public static string SearchText(this PDFMedia pdfMedia)
    {
        return ExtractPdfText(pdfMedia);
    }

    /// <summary>
    /// Returns the text extracted from the PDF's binary data.
    /// Same content as <see cref="SearchText"/>, exposed under a second name.
    /// </summary>
    /// <param name="pdfMedia">The PDF media item to read.</param>
    /// <returns>The extracted text, or null when the item has no binary data.</returns>
    public static string SearchAttachmentText(this PDFMedia pdfMedia)
    {
        return ExtractPdfText(pdfMedia);
    }

    /// <summary>
    /// Opens the media item's binary data and runs it through <see cref="PDFParser"/>.
    /// </summary>
    private static string ExtractPdfText(PDFMedia pdfMedia)
    {
        // Report the real parameter name; the original passed "contentMedia".
        pdfMedia.ValidateNotNullArgument("pdfMedia");
        // NOTE(review): ValidateNotNullArgument presumably throws on null, which would
        // make the null test below redundant — it is kept so behaviour cannot regress.
        if (pdfMedia == null || pdfMedia.BinaryData == null)
        {
            return null;
        }

        using (Stream s = pdfMedia.BinaryData.OpenRead())
        {
            PDFParser parser = new PDFParser();
            return parser.ExtractText(s);
        }
    }
}
Finally, register into InitializationModule:
// Index every PDFMedia instance: the predicate always returns true.
ContentIndexer.Instance.Conventions.ForInstancesOf<PDFMedia>().ShouldIndex(x => true);
// For PDFMedia, exclude SearchAttachment and the IContentMedia versions of
// SearchText / SearchAttachmentText, then include the PDFMedia-specific
// SearchText / SearchAttachmentText extension methods.
// NOTE(review): presumably this swaps the built-in attachment fields for the
// iTextSharp-extracted text — confirm against the Find conventions API.
SearchClient.Instance.Conventions.ForInstancesOf<PDFMedia>()
.ExcludeField(x => x.SearchAttachment())
.ExcludeField(x => (x as IContentMedia).SearchText())
.ExcludeField(x => (x as IContentMedia).SearchAttachmentText())
.IncludeField(x => x.SearchText())
.IncludeField(x => x.SearchAttachmentText());
That's all. However, iTextSharp only reads the text in a PDF; it can't translate images into text.
Hope this helps your case.
/Son Do
I faced the same issue recently. Couldn't figure out why SearchAttachmentText and SearchAttachment were missing in the index.
I uninstalled Adobe PDF iFilter and used PDF-XChange Viewer from Tracker Software.
That fixed my issue. All info is now visible in the index.
Blogged about a simple solution I ended up with... https://devblog.gosso.se/2018/11/associating-documents-pdf-to-a-page-pagetype-in-episerver-find/
Using
EPiServer.Find 13.0.1
EPiServer.Find.Cms.AttachmentFilter 13.0.1
And I have installed the Adobe PDF iFilter 64 11.0.01 on my laptop and restarted.
PDF files are stored as a type called TextBasedMedia when uploaded to the Episerver media folder.
When inspecting TextBasedMedia in the index I can't see any readable content from the PDF file.
SearchAttachmentText$$String is missing
as is SearchAttachment$$Attachment
Shouldn't there be a SearchAttachmentText$$String field at least?
I have cleared and re-indexed my developer index.
I'm able to perform a free-text search and get relevant hits matching text inside the indexed PDF-files using UnifiedSearch.
But as I understand I would get a better search relevancy using the attachment helper (EPiServer.Find.Cms.AttachmentFilter)?
What am I missing?